In [None]:
#| default_exp handlers.tepco

# TEPCO 
> Data pipeline (handler) to convert TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)) to `NetCDF` format

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import warnings
warnings.filterwarnings('ignore')

In [None]:
#| export
import pandas as pd
import re
import numpy as np
import fastcore.all as fc
from tqdm import tqdm
from collections import defaultdict

from marisco.callbacks import (
    Callback, 
    Transformer,
    EncodeTimeCB, 
    SanitizeLonLatCB,
    EncodeTimeCB, 
    )

from marisco.configs import cfg
from marisco.encoders import NetCDFEncoder

from marisco.metadata import (
    GlobAttrsFeeder, 
    BboxCB,
    TimeRangeCB,
    ZoteroCB, 
    KeyValuePairCB    
    )

from marisco.netcdf2csv import decode

## Configuration & file paths

In [None]:
#| exports
# Remote CSV holding both the coastal seawater measurements and, in a second
# section of the same file, the coordinates of the sampling points.
fname_coastal_water = 'https://radioactivity.nra.go.jp/cont/en/results/sea/coastal_water.csv'
# Remote Excel workbook read sheet by sheet; each sheet holds measurements
# followed by a location section.
fname_clos1F = 'https://radioactivity.nra.go.jp/cont/en/results/sea/close1F_water.xlsx'
# Station coordinates published by the RML-IAEA/iaea.orbs repository (CSV).
fname_iaea_orbs = 'https://raw.githubusercontent.com/RML-IAEA/iaea.orbs/refs/heads/main/src/iaea/orbs/stations/station_points.csv'

# Output path (relative to the notebook) — presumably the encoded NetCDF file; confirm where it is used.
fname_out = '../../_data/output/tepco.nc'

## Load data

We here load the data from the [NRA (Nuclear Regulatory Authority)](https://radioactivity.nra.go.jp/en/results) website. For the moment, we only process radioactivity concentration data in the seawater around Fukushima Dai-ichi NPP [TEPCO] (`coastal_water.csv`) and in the `close1F_water.xlsx` file.

In the near future, MARIS will provide a dedicated handler for all related [ALPS data](https://radioactivity.nra.go.jp/en/results#sec-12), including measurements provided not only by TEPCO but also by MOE, NRA, MLITT and Fukushima Prefecture.



:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

The **coastal_water.csv** file contains two sections: the measurements and the locations. We identify below the line number where the locations begin. A single point of truth for the location of the stations would ease the processing in future.

:::

In [None]:
#| exports
def find_location_section(df, 
                          col_idx=0,
                          pattern='Sampling point number'
                          ):
    """Return the index label of the first row whose `col_idx`-th column equals `pattern`.

    That row is the header of the location section embedded after the
    measurements. Returns -1 when no row matches.
    """
    is_marker = (df.iloc[:, col_idx] == pattern).to_numpy()
    hits = df.index[is_marker]
    if len(hits) == 0:
        return -1
    return hits[0]

In [None]:
#| eval: false
find_location_section(pd.read_csv(fname_coastal_water, low_memory=False))

np.int64(28039)

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Distinct parsing of the time from the `coastal_water.csv` and `close1F_water.xlsx` files is required. Indeed:

- `coastal_water.csv` stores the date as `YYYY/MM/DD` in the `Sampling date` column and the time as `HH:MM` in the `Sampling time` column, and
- `close1F_water.xlsx` uses the format `YYYY-MM-DD HH:MM:SS`.

:::

In [None]:
#| exports
def fix_sampling_time(x):
    """Normalise a sampling time to a zero-padded `HH:MM:00` string.

    Missing values (`None`/`NaN`) become midnight, `'00:00:00'`. Otherwise the
    value is split on `:`; the first two fields are taken as hour and minute,
    and any seconds field is discarded. Raises `ValueError` when the value has
    no `:` separator.
    """
    if pd.isna(x): 
        return '00:00:00'
    # `minute`, not `min`, so the builtin is not shadowed.
    hour, minute = str(x).strip().split(':')[:2]
    # Pad both fields: '9:5' must become '09:05:00', not '09:5:00'.
    return f"{hour.strip().zfill(2)}:{minute.strip().zfill(2)}:00"

In [None]:
#| exports
def get_coastal_water_df(fname_coastal_water):
    """Get the measurements dataframe from the `coastal_water.csv` file.

    The file is read twice: once to find the row where the location section
    starts, then once more (skipping the first line) for the measurement rows
    above that section. `Sampling date` and `Sampling time` are merged into a
    single `TIME` column (`YYYY/MM/DD HH:MM:00`) and then dropped.
    """
    # First pass: locate the header row of the embedded location section.
    locs_idx = find_location_section(pd.read_csv(fname_coastal_water, 
                                      skiprows=0, low_memory=False))
    
    # With no location section (-1), read every row instead of passing a
    # negative `nrows`, which pandas rejects.
    nrows = max(locs_idx - 1, 0) if locs_idx >= 0 else None
    
    df = pd.read_csv(fname_coastal_water, skiprows=1, 
                     nrows=nrows,
                     low_memory=False)
    df = df.dropna(subset=['Sampling point number'])
    
    # `.str.replace` substitutes inside each string; `Series.replace('-', '/')`
    # only matches cells equal to '-' and so never normalised the separator.
    sampling_date = df['Sampling date'].str.replace('-', '/', regex=False)
    sampling_time = df['Sampling time'].map(fix_sampling_time)
    
    return (df.assign(TIME=sampling_date + ' ' + sampling_time)
              .drop(columns=['Sampling date', 'Sampling time']))

In [None]:
#| eval: false
df_coastal_water = get_coastal_water_df(fname_coastal_water)
df_coastal_water.tail()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
28031,T-S3,上層,,,,,,,,,...,,,ND,6.9,,,,,,2025/1/8 09:36:00
28032,T-S4,上層,,,,,,,,,...,,,ND,6.9,,,,,,2025/1/8 09:59:00
28033,T-D5,上層,,,,,,,,,...,,,ND,6.3,,,,,,2025/1/13 07:44:00
28034,T-S8,上層,,,,,,,,,...,,,ND,6.6,,,,,,2025/1/15 05:22:00
28035,T-D5,上層,,,,,,,,,...,,,ND,7.4,,,,,,2025/1/20 07:57:00


:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Identification of the stations location requires three distinct files:

- the second section of the `coastal_water.csv` file
- the `R6zahyo.pdf` file further processed by [https://github.com/RML-IAEA/iaea.orbs](https://github.com/RML-IAEA/iaea.orbs)
- the second sections of all sheets of `close1F_water.xlsx` file
  
All of these files and sheets are required to look up the location of the stations.

:::

In [None]:
#| exports
def get_locs_coastal_water(fname_coastal_water):
    """Get the station locations from the second section of `coastal_water.csv`.

    Returns a dataframe with columns `STATION`, `LON`, `LAT` and `org`
    (set to `'coastal_seawater.csv'`). Rows missing a coordinate are dropped.
    """
    locs_idx = find_location_section(pd.read_csv(fname_coastal_water, 
                                      skiprows=0, low_memory=False))
    
    df = pd.read_csv(fname_coastal_water, skiprows=locs_idx+1, 
                     low_memory=False).iloc[:, :3]
    
    # The file lists latitude (~37) before longitude (~141). Labelling the
    # columns LON, LAT in file order swapped the two coordinates.
    df.columns = ['STATION', 'LAT', 'LON']
    # A station without both coordinates cannot be georeferenced.
    df = df.dropna(subset=['LON', 'LAT'])
    # Keep the column order STATION, LON, LAT used by the other location sources.
    return df[['STATION', 'LON', 'LAT']].assign(org='coastal_seawater.csv')

In [None]:
#| eval: false
df_locs_coastal_water = get_locs_coastal_water(fname_coastal_water)
print(f'Nb. of stations: {len(df_locs_coastal_water)}')
df_locs_coastal_water.head()

Nb. of stations: 48


Unnamed: 0,STATION,LON,LAT,org
0,T-0,37.42,141.04,coastal_seawater.csv
1,T-11,37.24,141.05,coastal_seawater.csv
2,T-12,37.15,141.04,coastal_seawater.csv
3,T-13-1,37.64,141.04,coastal_seawater.csv
4,T-14,37.55,141.06,coastal_seawater.csv


In [None]:
#| eval: false
df_locs_coastal_water.STATION.unique()

array(['T-0', 'T-11', 'T-12', 'T-13-1', 'T-14', 'T-17-1', 'T-18', 'T-20',
       'T-22', 'T-3', 'T-4', 'T-4-1', 'T-4-2', 'T-5', 'T-6', 'T-7', 'T-A',
       'T-B', 'T-B1', 'T-B2', 'T-B3', 'T-B4', 'T-C', 'T-D', 'T-D1',
       'T-D5', 'T-D9', 'T-E', 'T-E1', 'T-Z', 'T-MG6', 'T-S1', 'T-S7',
       'T-H1', 'T-S2', 'T-S6', 'T-M10', 'T-MA', 'T-S3', 'T-S4', 'T-S8',
       'T-MG4', 'T-G4', 'T-MG5', 'T-MG1', 'T-MG0', 'T-MG3', 'T-MG2'],
      dtype=object)

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Data contained in the `close1F_water.xlsx` file are spread across several sheets (one per station). Each sheet further contains two sections: the measurements and the locations. 

For each sheet, we have to identify the line number where to split both measurements and the location. We then need to further iterate over all sheets to concatenate the results.

:::

In [None]:
#| exports
def get_clos1F_df(fname_clos1F):
    """Get the measurements dataframe from `close1F_water.xlsx`, with a `TIME` column.

    Each sheet is read twice: once to find where its location section starts,
    then once more for the measurement rows above it. Sheets are concatenated
    in workbook order; `Sampling date` (rewritten with `/` separators) and
    `Sampling time` are joined into `TIME` and then dropped.
    """
    def _slash_date(raw):
        # Keep the date part of 'YYYY-MM-DD HH:MM:SS' and switch to '/' separators.
        return raw.split(' ')[0].replace('-', '/')

    workbook = pd.ExcelFile(fname_clos1F)
    per_sheet = []

    for sheet in tqdm(workbook.sheet_names):
        probe = pd.read_excel(workbook, sheet_name=sheet, skiprows=1)
        split_at = find_location_section(probe)

        meas = pd.read_excel(workbook,
                             sheet_name=sheet,
                             skiprows=1,
                             nrows=split_at - 1)
        meas.dropna(subset=['Sampling point number'], inplace=True)
        meas['Sampling date'] = meas['Sampling date'].astype(str).apply(_slash_date)
        per_sheet.append(meas)

    merged = pd.concat(per_sheet, ignore_index=True)
    merged.dropna(subset=['Sampling date'], inplace=True)
    merged['TIME'] = merged['Sampling date'] + ' ' + merged['Sampling time'].astype(str)
    return merged.drop(columns=['Sampling date', 'Sampling time'])

In [None]:
#| eval: false
df_clos1F = get_clos1F_df(fname_clos1F)
df_clos1F.head()

100%|██████████| 11/11 [00:04<00:00,  2.27it/s]


Unnamed: 0,Sampling point number,134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),Total beta radioactivity concentration (Bq/L),Total beta detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),Collection layer of seawater,...,106Ru detection limit (Bq/L),60Co radioactivity concentration (Bq/L),60Co detection limit (Bq/L),95Zr radioactivity concentration (Bq/L),95Zr detection limit (Bq/L),99Mo radioactivity concentration (Bq/L),99Mo detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),TIME
0,T-0-1,ND,1.5,ND,1.4,ND,18.0,,,,...,,,,,,,,,,2013/08/14 08:17:00
1,T-0-1,,,,,,,4.7,,,...,,,,,,,,,,2013/08/14 08:17:00
2,T-0-1,ND,1.1,ND,1.4,ND,20.0,,,,...,,,,,,,,,,2013/08/21 08:09:00
3,T-0-1,,,,,,,ND,2.9,,...,,,,,,,,,,2013/08/21 08:09:00
4,T-0-1,ND,0.66,ND,0.49,ND,17.0,,,,...,,,,,,,,,,2013/08/27 08:14:00


In [None]:
#| eval: false
df_clos1F['Sampling point number'].unique()

array(['T-0-1', 'T-0-1A', 'T-0-2', 'T-0-3', 'T-0-3A', 'T-1', 'T-2',
       'T-2-1', 'T-A1', 'T-A2', 'T-A3'], dtype=object)

In [None]:
#| exports
def get_locs_clos1F(fname_clos1F):
    """Get the station locations from the location section of every sheet of `close1F_water.xlsx`.

    Returns a dataframe with columns `STATION`, `LON`, `LAT` and `org`
    (set to `'close1F.csv'`). Rows without a latitude are dropped.
    """
    excel_file = pd.ExcelFile(fname_clos1F)
    frames = []
    
    for sheet_name in tqdm(excel_file.sheet_names):
        locs_idx = find_location_section(pd.read_excel(excel_file, 
                                                       sheet_name=sheet_name,
                                                       skiprows=1))
        frames.append(pd.read_excel(excel_file, 
                                    sheet_name=sheet_name, 
                                    skiprows=locs_idx+2))
    
    df = pd.concat(frames, ignore_index=True).iloc[:, :3]
    df = df.dropna(subset=['Sampling coordinate North latitude (Decimal)'])
    # The sheets list the north latitude (~37) before the longitude (~141).
    # Labelling the columns LON, LAT in file order swapped the two coordinates.
    df.columns = ['STATION', 'LAT', 'LON']
    # Keep the column order STATION, LON, LAT used by the other location sources.
    return df[['STATION', 'LON', 'LAT']].assign(org='close1F.csv')

In [None]:
#| eval: false
df_locs_clos1F = get_locs_clos1F(fname_clos1F)
print(f'Nb. of stations: {len(df_locs_clos1F)}')
df_locs_clos1F.head()

100%|██████████| 11/11 [00:05<00:00,  1.96it/s]

Nb. of stations: 11





Unnamed: 0,STATION,LON,LAT,org
0,T-0-1,37.43,141.04,close1F.csv
11,T-0-1A,37.43,141.05,close1F.csv
22,T-0-2,37.42,141.05,close1F.csv
33,T-0-3,37.42,141.04,close1F.csv
44,T-0-3A,37.42,141.05,close1F.csv


:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

The `close1F_water.xlsx` file contains station locations that are not present in the `coastal_water.csv` dataset, as demonstrated in the comparison below:
:::

In [None]:
#| eval: false
set(df_locs_clos1F.STATION) - set(df_locs_coastal_water.STATION)

{'T-0-1',
 'T-0-1A',
 'T-0-2',
 'T-0-3',
 'T-0-3A',
 'T-1',
 'T-2',
 'T-2-1',
 'T-A1',
 'T-A2',
 'T-A3'}

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

In theory all locations are supposed to be provided in the [R6zahyo.pdf](https://radioactivity.nra.go.jp/cont/en/results/sea/R6zahyo.pdf) file. This file is further processed by https://github.com/RML-IAEA/iaea.orbs and the result is provided in the `station_points.csv` file. 

However, this file lacks complete coverage of locations referenced in both `coastal_water.csv` and `close1F_water.xlsx` files, while simultaneously containing additional locations not present in either (see below). A more standardized and comprehensive location reference system would significantly improve the efficiency and reliability of the data ingestion process.

:::

In [None]:
#| exports
def get_locs_orbs(fname_iaea_orbs):
    "Read the iaea.orbs station file and relabel its four columns as `org`, `STATION`, `LON`, `LAT`."
    return pd.read_csv(fname_iaea_orbs).set_axis(['org', 'STATION', 'LON', 'LAT'], axis=1)

In [None]:
#| eval: false
df_locs_orbs = get_locs_orbs(fname_iaea_orbs)
df_locs_orbs.head()

Unnamed: 0,org,STATION,LON,LAT
0,MOE,E-31,141.727667,39.059167
1,MOE,E-32,141.635667,38.996
2,MOE,E-37,141.948611,39.259167
3,MOE,E-38,141.755,39.008333
4,MOE,E-39,141.766667,38.991667


In [None]:
#| eval: False
set(df_locs_orbs.STATION) - (set(df_locs_clos1F.STATION) | set(df_locs_coastal_water.STATION))

{'C-P1',
 'C-P2',
 'C-P3',
 'C-P4',
 'C-P5',
 'C-P8',
 'E-31',
 'E-32',
 'E-37',
 'E-38',
 'E-39',
 'E-3A',
 'E-41',
 'E-42',
 'E-43',
 'E-44',
 'E-45',
 'E-46',
 'E-47',
 'E-48',
 'E-49',
 'E-4A',
 'E-4B',
 'E-4C',
 'E-4F',
 'E-4G',
 'E-4H',
 'E-4J',
 'E-4K',
 'E-4L',
 'E-4M',
 'E-71',
 'E-72',
 'E-73',
 'E-74',
 'E-75',
 'E-76',
 'E-77',
 'E-78',
 'E-79',
 'E-7A',
 'E-7B',
 'E-7C',
 'E-7D',
 'E-7F',
 'E-7G',
 'E-7H',
 'E-7I',
 'E-7J',
 'E-7K',
 'E-7L',
 'E-81',
 'E-82',
 'E-83',
 'E-84',
 'E-85',
 'E-S1',
 'E-S10',
 'E-S13',
 'E-S14',
 'E-S15',
 'E-S17',
 'E-S18',
 'E-S19',
 'E-S20',
 'E-S21',
 'E-S22',
 'E-S23',
 'E-S24',
 'E-S25',
 'E-S26',
 'E-S27',
 'E-S28',
 'E-S29',
 'E-S3',
 'E-S30',
 'E-S31',
 'E-S32',
 'E-S33',
 'E-S34',
 'E-S35',
 'E-S36',
 'E-S4',
 'E-S5',
 'E-T1',
 'E-T2',
 'E-T3',
 'E-T4',
 'E-T5',
 'E-T6',
 'E-T7',
 'E-T8',
 'F-P01',
 'F-P02',
 'F-P03',
 'F-P04',
 'F-P05',
 'F-P06',
 'F-P07',
 'F-P08',
 'F-P09',
 'F-P10',
 'F-P11',
 'F-P12',
 'F-P13',
 'F-P14',
 'F-P15'

In [None]:
#| exports
def concat_locs(dfs):
    """Concatenate station-location dataframes and keep one row per `STATION`.

    When a station appears in several sources, the row kept is chosen by the
    `org` column, in this order of preference:

    1. any other `org` value (e.g. the iaea.orbs station file),
    2. `'coastal_seawater.csv'`,
    3. `'close1F.csv'`.

    Within one source the first occurrence wins. The result is sorted by
    `STATION` and keeps the original index labels.
    """
    df = pd.concat(dfs)
    # Lower rank = higher priority; sources not listed here rank first.
    rank = {'coastal_seawater.csv': 1, 'close1F.csv': 2}
    priority = df['org'].map(lambda org: rank.get(org, 0)).to_numpy()
    # Stable sorts keep the input order among ties, so the surviving duplicate
    # is deterministic (the default quicksort gives no such guarantee).
    return (df.assign(org_grp=priority)
              .sort_values('org_grp', ascending=True, kind='mergesort')
              .drop_duplicates(subset='STATION', keep='first')
              .drop(columns=['org_grp'])
              .sort_values('STATION', ascending=True, kind='mergesort'))

In [None]:
#| eval: false
df_locs = concat_locs([df_locs_clos1F, df_locs_coastal_water, df_locs_orbs])
df_locs.head()

Unnamed: 0,STATION,LON,LAT,org
214,C-P1,139.863333,35.425,NRA
215,C-P2,139.863333,35.401667,NRA
216,C-P3,139.881667,35.37,NRA
217,C-P4,139.846667,35.356667,NRA
218,C-P5,139.8,35.343333,NRA


In [None]:
#| exports
def align_dfs(df_from, df_to):
    """Align columns structure of df_from to df_to.

    Returns a new dataframe with exactly the columns of `df_to`, in that
    order, one row per row of `df_from` and a fresh 0..n-1 index. Columns
    missing from `df_from` are filled with `NaN`; columns of `df_from` that
    `df_to` does not have are dropped.
    """
    data = {c: (df_from[c].values if c in df_from.columns else np.nan)
            for c in df_to.columns}
    # An explicit index lets pandas build the frame even when every column is
    # the scalar NaN, i.e. when the two frames share no column at all.
    return pd.DataFrame(data, index=pd.RangeIndex(len(df_from)))

In [None]:
# | eval: false
align_dfs(df_clos1F, df_coastal_water).head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
0,T-0-1,,,,ND,1.5,ND,1.4,,,...,,,,,,,,,,2013/08/14 08:17:00
1,T-0-1,,,,,,,,,,...,,,4.7,,,,,,,2013/08/14 08:17:00
2,T-0-1,,,,ND,1.1,ND,1.4,,,...,,,,,,,,,,2013/08/21 08:09:00
3,T-0-1,,,,,,,,,,...,,,ND,2.9,,,,,,2013/08/21 08:09:00
4,T-0-1,,,,ND,0.66,ND,0.49,,,...,,,,,,,,,,2013/08/27 08:14:00


In [None]:
#| exports
def concat_dfs(df_coastal_water, df_clos1F):
    """Stack the coastal-water measurements on top of the close-1F measurements.

    `df_clos1F` is first reshaped to the column layout of `df_coastal_water`.
    No rows are removed and the original index labels are kept.
    """
    aligned_clos1F = align_dfs(df_clos1F, df_coastal_water)
    return pd.concat([df_coastal_water, aligned_clos1F])

In [None]:
#| eval: false
df_meas = concat_dfs(df_coastal_water, df_clos1F)
df_meas.head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,,,,,2011/3/21 23:15:00
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,,,,,2011/3/21 23:45:00
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,,,,,2011/3/22 14:28:00
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,,,,,2011/3/22 15:06:00
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,,,,,34.0,25.0,,2011/3/23 13:51:00


In [None]:
#| exports
def georef_data(df_meas, df_locs):
    "Georeference measurements dataframe using locations dataframe."
    assert "Sampling point number" in df_meas.columns and "STATION" in df_locs.columns
    return pd.merge(df_meas, df_locs, how="inner", 
                    left_on='Sampling point number', right_on='STATION')

In [None]:
#| eval: false
df_meas_georef = georef_data(df_meas, df_locs)
df_meas_georef.head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| exports
def load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs):
    """Load, align and georeference TEPCO data.

    Builds the station lookup from the three location sources, stacks the two
    measurement sources, drops measurements without a sampling point and
    returns `{'SEAWATER': <georeferenced measurements>}`.
    """
    location_sources = [
        get_locs_coastal_water(fname_coastal_water),
        get_locs_clos1F(fname_clos1F),
        get_locs_orbs(fname_iaea_orbs),
    ]
    stations = concat_locs(location_sources)

    measurements = concat_dfs(get_coastal_water_df(fname_coastal_water),
                              get_clos1F_df(fname_clos1F))
    measurements.dropna(subset=['Sampling point number'], inplace=True)

    return {'SEAWATER': georef_data(measurements, stations)}

In [None]:
#| eval: false
dfs = load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs)
dfs['SEAWATER'].head()

100%|██████████| 11/11 [00:04<00:00,  2.35it/s]
100%|██████████| 11/11 [00:05<00:00,  1.98it/s]


Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| eval: false
print(f"# of cols, rows: {dfs['SEAWATER'].shape}")
dfs['SEAWATER'].head()

# of cols, rows: (47526, 53)


Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,1.6E+02,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,1.2E+02,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,ND,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,ND,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,2.0E+02,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


In [None]:
#| eval: false
dfs['SEAWATER'].STATION.unique()

array(['T-3', 'T-4', 'T-5', 'T-7', 'T-11', 'T-12', 'T-14', 'T-18', 'T-20',
       'T-22', 'T-MA', 'T-M10', 'T-A', 'T-D', 'T-E', 'T-B', 'T-C',
       'T-MG1', 'T-MG2', 'T-MG3', 'T-MG4', 'T-MG5', 'T-MG6', 'T-D1',
       'T-D5', 'T-D9', 'T-E1', 'T-G4', 'T-H1', 'T-S5', 'T-S6', 'T-17-1',
       'T-B3', 'T-13-1', 'T-S3', 'T-S4', 'T-B4', 'T-S1', 'T-S2', 'T-MG0',
       'T-Z', 'T-B1', 'T-B2', 'T-S7', 'T-S8', 'T-0', 'T-4-1', 'T-4-2',
       'T-6', 'T-0-1', 'T-0-1A', 'T-0-2', 'T-0-3', 'T-0-3A', 'T-1', 'T-2',
       'T-2-1', 'T-A1', 'T-A2', 'T-A3'], dtype=object)

## Fix missing values

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

We remap the `ND` value to `NaN`. Please confirm that this is the correct way to handle missing values.
:::


`ND` is assigned `NaN`. This needs to be confirmed.

In [None]:
#| exports
class FixMissingValuesCB(Callback):
    "Assign `NaN` to values equal to `ND` (not detected) - to be confirmed "
    def __call__(self, tfm): 
        # Every cell equal to the string 'ND' is overwritten in place, in every dataframe.
        for frame in tfm.dfs.values():
            frame[frame == 'ND'] = np.nan

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[FixMissingValuesCB()])
tfm()['SEAWATER'].head()

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
0,T-3,,1100.0,13.0,48.0,9.2,53.0,8.8,160.0,44.0,...,,,,,,2011/3/21 23:15:00,T-3,141.026389,37.322222,TEPCO
1,T-4,,660.0,12.0,31.0,8.7,33.0,8.3,120.0,41.0,...,,,,,,2011/3/21 23:45:00,T-4,141.013889,37.241667,TEPCO
2,T-3,,1100.0,20.0,46.0,14.0,40.0,14.0,,88.0,...,,,,,,2011/3/22 14:28:00,T-3,141.026389,37.322222,TEPCO
3,T-4,,670.0,19.0,39.0,11.0,44.0,11.0,,79.0,...,,,,,,2011/3/22 15:06:00,T-4,141.013889,37.241667,TEPCO
4,T-3,,740.0,27.0,51.0,20.0,55.0,20.0,200.0,58.0,...,,,34.0,25.0,,2011/3/23 13:51:00,T-3,141.026389,37.322222,TEPCO


## Remove 約 (about) character
    

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

We systematically remove the `約` character. Please confirm that this is the correct way to handle this. We could imagine that mentioning uncertainty would be less ambiguous in future.

:::

In [None]:
#| exports
class RemoveJapanaseCharCB(Callback):
    "Remove 約 (about) char"
    def _transform_if_about(self, value, about_char='約'):
        "Return `value` with `about_char` stripped out; missing or unaffected values are returned as-is."
        if pd.isna(value):
            return value
        if about_char in str(value):
            return value.replace(about_char, '')
        return value
    
    def __call__(self, tfm): 
        # Only object-typed '(Bq/L)' columns are processed.
        for frame in tfm.dfs.values():
            targets = [col for col in frame.columns
                       if '(Bq/L)' in col and frame[col].dtype == 'object']
            frame[targets] = frame[targets].map(self._transform_if_about)

In [None]:
#| eval: false
# `dfs` is only created in an `eval: false` cell (it downloads the data), so
# this cell must be skipped too or it fails with a NameError when the notebook
# is executed non-interactively.
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB()])

tfm()['SEAWATER'].sample(10)

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
6421,T-MG1,下層,,,0.0027,,0.0079,,,,...,,,,,,2014/1/7 10:33:00,T-MG1,141.283333,38.333333,TEPCO
6722,T-5,上層,,,0.0021,,0.0052,,,,...,,,,,,2014/2/26 08:46:00,T-5,141.2,37.416667,TEPCO
17234,T-13-1,上層,,,,0.0015,0.011,,,,...,,,,,,2019/12/6 07:53:00,T-13-1,141.0425,37.640833,TEPCO
26653,T-0-1,,,,,,,,,,...,,,,,,2018/10/29 06:36:00,T-0-1,141.040278,37.430556,TEPCO
3070,T-D1,上層,,,,,,,,,...,,,,,,2012/4/10 09:20:00,T-D1,141.072222,37.5,TEPCO
29812,T-0-2,,,,,,,,,,...,,,,,,2022/02/21 06:36:00,T-0-2,141.046667,37.423333,TEPCO
14787,T-D9,下層,,,,0.001,0.0065,,,,...,,,,,,2018/6/25 08:38:00,T-D9,141.072167,37.333333,TEPCO
7293,T-MG6,下層,,,0.0016,,0.0083,,,,...,,,,,,2014/6/3 10:05:00,T-MG6,141.0,38.083333,TEPCO
13967,T-D9,上層,,,,0.0012,0.007,,,,...,,,,,,2018/1/5 08:57:00,T-D9,141.072167,37.333333,TEPCO
14382,T-5,上層,,,,,,,,,...,,,,,,2018/4/2 07:24:00,T-5,141.2,37.416667,TEPCO


## Fix values range string

:::{.callout-important}
## FEEDBACK TO DATA PROVIDER

Value ranges are provided as strings (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7'). We replace them by their mean. Please confirm that this is the correct way to handle this. Again, mentioning uncertainty would be less ambiguous in future.

:::

In [None]:
#| exports
class FixRangeValueStringCB(Callback):
    "Replace range values (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7') by their mean"
    
    def _extract_and_calculate_mean(self, s):
        "Return the mean of all numbers found in `s`, or `s` unchanged if it contains none."
        # Mantissa (digits with optional fraction, or a bare '.5'), then an
        # optional exponent. The exponent sign and digits are only accepted
        # after an explicit E/e, so '1-2' or '4.0E' can no longer be captured
        # as a single token that `float` then rejects.
        float_strings = re.findall(r"[+-]?(?:\d+\.?\d*|\.\d+)(?:[Ee][+-]?\d+)?", s)
        if float_strings:
            float_numbers = np.array(float_strings, dtype=float)
            return float_numbers.mean()
        return s
    
    def _transform_if_range(self, value):
        "Turn a range string into its mean; other non-missing values are returned as strings."
        if pd.isna(value): 
            return value
        value = str(value)
        # Check for both range patterns
        if '<&<' in value or '～' in value:
            return self._extract_and_calculate_mean(value)
        return value

    def __call__(self, tfm): 
        for k in tfm.dfs.keys():
            # Only object-typed '(Bq/L)' columns can hold range strings.
            cols_rdn = [c for c in tfm.dfs[k].columns 
                       if ('(Bq/L)' in c) and (tfm.dfs[k][c].dtype == 'object')]
            # After the ranges are resolved, the selected columns are cast to float.
            tfm.dfs[k][cols_rdn] = tfm.dfs[k][cols_rdn].map(self._transform_if_range).astype(float)

In [None]:
#| eval: false
# `dfs` is only created in an `eval: false` cell (it downloads the data), so
# this cell must be skipped too or it fails with a NameError when the notebook
# is executed non-interactively.
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB()
    ])

df_test = tfm()['SEAWATER']
df_test.sample(10)

Unnamed: 0,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),...,125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,TIME,STATION,LON,LAT,org
14187,T-18,下層,,,,0.0011,0.0033,,,,...,,,,,,2018/2/16 09:46:00,T-18,140.922222,36.905556,TEPCO
39947,T-2,上層,,0.54,,1.3,,1.6,,,...,,1.9,,,,2012/03/30 08:30:00,T-2,141.033611,37.415833,TEPCO
20057,T-3,上層,,,0.0011,,0.035,,,,...,,,,,,2021/8/17 10:00:00,T-3,141.026389,37.322222,TEPCO
37114,T-1,上層,,,,0.69,,0.56,,,...,,,,,,2020/03/22 07:40:00,T-1,141.034444,37.431111,TEPCO
28551,T-0-1A,,,,,,,,,,...,,,,,,2023/08/25 07:10:00,T-0-1A,141.046667,37.430556,TEPCO
43219,T-2,上層,,,,0.64,,0.58,,,...,,,,,,2022/08/23 09:10:00,T-2,141.033611,37.415833,TEPCO
29136,T-0-2,,,,,,,,,,...,,,,,,2015/08/17 08:57:00,T-0-2,141.046667,37.423333,TEPCO
798,T-A,下層,,8.0,,15.0,,16.0,,,...,,,,,,2011/7/12 07:35:00,T-A,140.763889,36.713889,TEPCO
9176,T-MG3,中層,,,,0.0013,0.0013,,,,...,,,,,,2015/5/7 09:29:00,T-MG3,141.583333,38.233333,TEPCO
13884,T-B,下層,,,,0.97,,1.2,,,...,,,,,,2017/12/13 07:52:00,T-B,140.665556,36.506389,TEPCO


## Select columns of interest

We select the columns of interest and in particular the elements of interest, in our case radionuclides.

In [None]:
#| exports
# Non-nuclide columns that `SelectColsOfInterestCB` always keeps.
common_coi = ['LON', 'LAT', 'TIME', 'STATION']
# Substring a column name must contain to be kept as a nuclide column.
nuclides_pattern = '(Bq/L)'

In [None]:
#| exports
class SelectColsOfInterestCB(Callback):
    """Select columns of interest.

    Keeps, in the `SEAWATER` dataframe, the `common_coi` columns followed by
    every column whose name contains `nuclides_pattern`.
    """
    def __init__(self, common_coi, nuclides_pattern): fc.store_attr()
    def __call__(self, tfm):
        df = tfm.dfs['SEAWATER']
        # Use the pattern given to the constructor; the previous code read the
        # module-level `nuclides_pattern` and silently ignored the argument.
        nuc_of_interest = [c for c in df.columns if self.nuclides_pattern in c]
        tfm.dfs['SEAWATER'] = df[list(self.common_coi) + nuc_of_interest]

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

Unnamed: 0,LON,LAT,TIME,STATION,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,144Ce radioactivity concentration (Bq/L),144Ce detection limit (Bq/L),54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L)
23682,141.2,37.233333,2023/9/28 07:20:00,T-7,,,,0.0014,0.0015,,...,,,,,,,,,,
12020,141.072222,37.416667,2016/11/8 07:46:00,T-D5,,,0.0027,,0.015,,...,,,,,,,,,,
30402,141.040278,37.416111,2014/03/04 10:42:00,T-0-3,,,,0.73,,0.62,...,,,,,,,,,,
16929,141.0425,37.640833,2019/10/3 06:21:00,T-13-1,,,,0.0014,0.0037,,...,,,,,,,,,,
3495,141.042222,37.584722,2012/7/3 05:33:00,T-S1,,,0.17,,0.24,,...,,,,,,,,,,


In [None]:
#### Debugging

In [None]:
[o for o in df_test.columns if 'beta' in o or 'alpha' in o]

['Total alpha radioactivity concentration (Bq/L)',
 'Total alpha detection limit (Bq/L)',
 'Total beta radioactivity concentration (Bq/L)',
 'Total beta detection limit (Bq/L)']

In [None]:
df_test.melt(id_vars=['LON', 'LAT', 'TIME', 'STATION']).head()

Unnamed: 0,LON,LAT,TIME,STATION,variable,value
0,141.026389,37.322222,2011/3/21 23:15:00,T-3,131I radioactivity concentration (Bq/L),1100.0
1,141.013889,37.241667,2011/3/21 23:45:00,T-4,131I radioactivity concentration (Bq/L),660.0
2,141.026389,37.322222,2011/3/22 14:28:00,T-3,131I radioactivity concentration (Bq/L),1100.0
3,141.013889,37.241667,2011/3/22 15:06:00,T-4,131I radioactivity concentration (Bq/L),670.0
4,141.026389,37.322222,2011/3/23 13:51:00,T-3,131I radioactivity concentration (Bq/L),740.0


In [None]:
df_tmp = df_test.melt(id_vars=['LON', 'LAT', 'TIME', 'STATION'])
df_tmp.sample(10)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value
2132860,141.033611,37.415833,2019/10/07 07:00:00,T-2,105Ru detection limit (Bq/L),
1449050,141.0825,37.428611,2023/6/29 10:08:00,T-S4,106Ru detection limit (Bq/L),
1576961,141.078889,37.383333,2015/1/25 07:01:00,T-S8,60Co radioactivity concentration (Bq/L),
49580,141.2,37.233333,2011/12/7 08:10:00,T-7,131I detection limit (Bq/L),0.67
1200897,141.666667,38.3,2017/4/17 08:25:00,T-MG2,Total beta radioactivity concentration (Bq/L),
1915322,140.837222,35.796111,2018/3/13 13:31:00,T-E,3H detection limit (Bq/L),
595162,141.2,37.416667,2024/5/16 07:23:00,T-5,140La radioactivity concentration (Bq/L),
1181024,141.033611,37.415833,2017/01/31 07:10:00,T-2,Total alpha detection limit (Bq/L),
2021492,141.026389,37.322222,2024/8/13 09:25:00,T-3,125Sb detection limit (Bq/L),
2123947,141.046667,37.416111,2023/12/04 07:46:00,T-0-3A,105Ru detection limit (Bq/L),


In [None]:
df_tmp.variable.apply(lambda x: x.split(' '))

0          [131I, radioactivity, concentration, (Bq/L)]
1          [131I, radioactivity, concentration, (Bq/L)]
2          [131I, radioactivity, concentration, (Bq/L)]
3          [131I, radioactivity, concentration, (Bq/L)]
4          [131I, radioactivity, concentration, (Bq/L)]
                               ...                     
2138665               [105Ru, detection, limit, (Bq/L)]
2138666               [105Ru, detection, limit, (Bq/L)]
2138667               [105Ru, detection, limit, (Bq/L)]
2138668               [105Ru, detection, limit, (Bq/L)]
2138669               [105Ru, detection, limit, (Bq/L)]
Name: variable, Length: 2138670, dtype: object

In [None]:
def extract_nuclide(text):
    """Return the nuclide label that starts a measurement column header.

    The label is the first space-separated token, except when the second
    token is 'alpha' or 'beta' (case-insensitive), in which case the first
    two tokens are kept (e.g. 'Total alpha', 'Total beta').
    """
    tokens = text.split(' ')
    has_two_word_label = len(tokens) > 1 and tokens[1].lower() in ('alpha', 'beta')
    return ' '.join(tokens[:2]) if has_two_word_label else tokens[0]

df_tmp['variable'].map(extract_nuclide).unique()

array(['131I', '134Cs', '137Cs', '132I', '132Te', '136Cs', '140La',
       '89Sr', '90Sr', '238Pu', '239Pu+240Pu', 'Total alpha',
       'Total beta', '140Ba', '106Ru', '58Co', '60Co', '144Ce', '54Mn',
       '3H', '125Sb', '105Ru'], dtype=object)

In [None]:
df_tmp['NUCLIDE'] = df_tmp['variable'].map(extract_nuclide)
df_tmp.sample(10)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value,NUCLIDE
1850522,37.41,141.03,2013/01/06 07:35:00,T-2-1,54Mn detection limit (Bq/L),,54Mn
439722,141.072167,37.333333,2016/11/1 09:45:00,T-D9,132Te detection limit (Bq/L),,132Te
231288,141.033611,37.415833,2018/09/29 07:08:00,T-2,137Cs radioactivity concentration (Bq/L),,137Cs
1723040,141.013889,37.241667,2016/11/29 11:00:00,T-4,144Ce detection limit (Bq/L),,144Ce
1053663,141.072167,37.333333,2014/10/25 07:50:00,T-D9,239Pu+240Pu detection limit (Bq/L),,239Pu+240Pu
2002734,141.026389,37.322222,2014/2/18 10:40:00,T-3,125Sb detection limit (Bq/L),,125Sb
904296,141.047222,37.241667,2011/8/30 07:00:00,T-11,238Pu detection limit (Bq/L),,238Pu
1361899,141.040278,37.416111,2021/07/19 06:30:00,T-0-3,140Ba detection limit (Bq/L),,140Ba
191478,141.2,37.416667,2011/9/12 08:15:00,T-5,137Cs radioactivity concentration (Bq/L),,137Cs
1982211,141.034444,37.431111,2012/10/14 08:20:00,T-1,125Sb radioactivity concentration (Bq/L),,125Sb


In [None]:
df_tmp['is_concentration'] = df_tmp['variable'].str.contains('radioactivity concentration')

In [None]:
df_tmp['is_dl'] = df_tmp['variable'].str.contains('detection limit')

In [None]:
df_tmp['is_unc'] = df_tmp['variable'].str.contains('statistical error')

In [None]:
df_tmp.sample(10)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value,NUCLIDE,is_concentration,is_dl,is_unc
863917,141.047222,37.241667,2014/12/28 10:26:00,T-11,238Pu radioactivity concentration (Bq/L),,238Pu,True,False,False
2066557,141.072222,37.416667,2023/4/18 08:16:00,T-D5,105Ru radioactivity concentration (Bq/L),,105Ru,True,False,False
969527,141.072167,37.333333,2020/12/21 09:30:00,T-D9,239Pu+240Pu radioactivity concentration (Bq/L),,239Pu+240Pu,True,False,False
1165099,141.072222,37.416667,2024/3/11 10:18:00,T-D5,Total alpha detection limit (Bq/L),,Total alpha,False,True,False
278912,141.033611,37.415833,2018/12/03 08:05:00,T-2,137Cs detection limit (Bq/L),,137Cs,False,True,False
1515106,141.033611,37.415833,2019/12/02 07:10:00,T-2,58Co radioactivity concentration (Bq/L),,58Co,True,False,False
1170076,141.046667,37.423333,2018/08/28 07:20:00,T-0-2,Total alpha detection limit (Bq/L),,Total alpha,False,True,False
1493639,141.0,38.083333,2021/10/7 11:10:00,T-MG6,58Co radioactivity concentration (Bq/L),,58Co,True,False,False
1001310,140.922222,36.905556,2012/5/21 05:15:00,T-18,239Pu+240Pu statistical error (Bq/L),,239Pu+240Pu,False,False,True
1659828,141.033611,37.415833,2024/01/07 06:20:00,T-2,60Co detection limit (Bq/L),,60Co,False,True,False


In [None]:
# Capture the text inside the first pair of parentheses of each header (e.g. 'Bq/L').
df_tmp['UNIT'] = df_tmp['variable'].str.extract(r'\((.*?)\)')
df_tmp.sample(5)

Unnamed: 0,LON,LAT,TIME,STATION,variable,value,NUCLIDE,is_concentration,is_dl,is_unc,UNIT
1036130,141.034444,37.431111,2022/03/31 07:22:00,T-1,239Pu+240Pu statistical error (Bq/L),,239Pu+240Pu,False,False,True,Bq/L
1976477,141.046667,37.430556,2017/07/03 07:43:00,T-0-1A,125Sb radioactivity concentration (Bq/L),,125Sb,True,False,False,Bq/L
842871,141.034444,37.431111,2015/08/11 06:15:00,T-1,90Sr detection limit (Bq/L),,90Sr,False,True,False,Bq/L
780920,141.072167,37.333333,2021/11/15 08:34:00,T-D9,90Sr radioactivity concentration (Bq/L),,90Sr,True,False,False,Bq/L
41094,141.033611,37.415833,2018/07/23 07:55:00,T-2,131I radioactivity concentration (Bq/L),,131I,True,False,False,Bq/L


In [None]:
# Check rows where all three columns are False
mask = ~(df_tmp['is_concentration'] | df_tmp['is_dl'] | df_tmp['is_unc'])
df_tmp[mask].sample(min(10, mask.sum())) if mask.any() else "No rows where all columns are False"

'No rows where all columns are False'

In [None]:
def _type_column(self, df):
        "Create type column."
        conditions = [
            df['is_concentration'],
            df['is_dl'],
            df['is_unc']
        ]
        choices = ['VALUE', 'DL', 'UNC']
        df['type'] = np.select(conditions, choices)
        df = df.drop(['is_concentration', 'is_dl', 'is_unc'], axis=1)
        return df

In [None]:
conditions = [
            df_tmp['is_concentration'],
            df_tmp['is_dl'],
            df_tmp['is_unc']
        ]

In [None]:
choices = ['VALUE', 'DL', 'UNC']
df_tmp['type'] = np.select(conditions, choices, default='Not Available')

In [None]:
df_tmp['type'].unique()

array(['VALUE', 'DL', 'UNC'], dtype=object)

## Reshape: wide to long

This step is necessary to extract information such as nuclide names,  detection limit, uncertainty, ...


In [None]:
#| exports
class WideToLongCB(Callback):
    """
    Parse TEPCO measurement columns to extract nuclide name, measurement value, 
    detection limit and uncertainty
    """
    def __init__(self): fc.store_attr()

    def _melt(self, df):
        "Unpivot every column except LON/LAT/TIME/STATION into `variable`/`value` rows."
        return df.melt(id_vars=['LON', 'LAT', 'TIME', 'STATION'])

    def _extract_nuclide(self, text):
        "Return the first token of `text`, or the first two when the second is 'alpha'/'beta'."
        tokens = text.split(' ')
        two_word_label = len(tokens) >= 2 and tokens[1].lower() in ['alpha', 'beta']
        return f"{tokens[0]} {tokens[1]}" if two_word_label else tokens[0]

    def _nuclide_name(self, df):
        "Add a `NUCLIDE` column derived from the `variable` column."
        df['NUCLIDE'] = df['variable'].map(self._extract_nuclide)
        return df

    def _type_indicator(self, df):
        "Add one boolean column per kind of measurement found in `variable`."
        phrases = (
            ('is_concentration', 'radioactivity concentration'),
            ('is_dl', 'detection limit'),
            ('is_unc', 'statistical error'),
        )
        for flag, phrase in phrases:
            df[flag] = df['variable'].str.contains(phrase)
        return df

    def _unit(self, df):
        "Add a `UNIT` column holding the text between parentheses in `variable`."
        df['UNIT'] = df['variable'].str.extract(r'\((.*?)\)')
        return df

    def _type_column(self, df):
        "Replace the boolean indicator columns by a single `type` column."
        flags = ['is_concentration', 'is_dl', 'is_unc']
        labels = ['VALUE', 'DL', 'UNC']
        df['type'] = np.select([df[flag] for flag in flags], labels, default='Not Available')
        return df.drop(columns=flags)

    def __call__(self, tfm):
        # Long format first, then annotate each row with nuclide, kind and unit.
        long_df = self._melt(tfm.dfs['SEAWATER'])
        for step in (self._nuclide_name, self._type_indicator, self._unit, self._type_column):
            long_df = step(long_df)
        # One row per sample/nuclide/unit, with a column for each `type` label present.
        wide_df = pd.pivot_table(
            long_df,
            values='value',
            index=['LON', 'LAT', 'TIME', 'STATION', 'NUCLIDE', 'UNIT'],
            columns='type',
            aggfunc='first',
        ).reset_index()
        # The second reset turns the fresh 0..n-1 row index into a column, exposed as `ID`.
        tfm.dfs['SEAWATER'] = wide_df.reset_index().rename(columns={'index': 'ID'})
        

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.head()

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
0,0,37.21,141.01,2012/10/16 07:25:00,T-4-1,131I,Bq/L,0.13,,
1,1,37.21,141.01,2012/10/16 07:25:00,T-4-1,134Cs,Bq/L,0.19,,
2,2,37.21,141.01,2012/10/16 07:25:00,T-4-1,137Cs,Bq/L,0.27,,
3,3,37.21,141.01,2012/10/2 07:30:00,T-4-1,131I,Bq/L,0.11,,
4,4,37.21,141.01,2012/10/2 07:30:00,T-4-1,134Cs,Bq/L,0.22,,


## Remap `UNIT` name to MARIS nomenclature

In [None]:
#| exports
# Unit label as extracted from the TEPCO column headers -> MARIS unit id
# (consumed by `RemapUnitNameCB`).
unit_mapping = {'Bq/L': 3}

In [None]:
#| exports
class RemapUnitNameCB(Callback):
    """
    Remap `UNIT` name to MARIS id.
    """
    def __init__(self, unit_mapping): fc.store_attr()

    def __call__(self, tfm):
        # Labels absent from `unit_mapping` come out of `Series.map` as NaN.
        seawater = tfm.dfs['SEAWATER']
        seawater['UNIT'] = seawater['UNIT'].map(self.unit_mapping)


In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
84463,84463,141.283333,38.333333,2011/10/11 11:11:00,T-MG1,134Cs,3,6.0,,
63230,63230,141.046667,37.430556,2024/01/29 07:51:00,T-0-1A,Total beta,3,13.0,,
42154,42154,141.034444,37.431111,2019/01/21 07:30:00,T-1,134Cs,3,0.62,,0.0025
45478,45478,141.034444,37.431111,2022/11/20 07:50:00,T-1,134Cs,3,0.6,,
87444,87444,141.583333,38.633333,2019/8/6 10:30:00,T-MG0,134Cs,3,0.0014,,


## Remap `NUCLIDE` name to MARIS nomenclature

In [None]:
#| exports
# Nuclide label as extracted from the TEPCO column headers -> MARIS nuclide id
# (consumed by `RemapNuclideNameCB`).
nuclide_mapping = {
    '131I': 29,
    '134Cs': 31,
    '137Cs': 33,
    '125Sb': 24,
    'Total beta': 103,
    '238Pu': 67,
    '239Pu+240Pu': 77,
    '3H': 1,
    '89Sr': 11,
    '90Sr': 12,
    'Total alpha': 104,
    '132I': 100,
    '136Cs': 102,
    '58Co': 8,
    '105Ru': 97,
    '106Ru': 17,
    '140La': 35,
    '140Ba': 34,
    '132Te': 99,
    '60Co': 9,
    '144Ce': 37,
    '54Mn': 6
}

In [None]:
#| exports
class RemapNuclideNameCB(Callback):
    """
    Remap `NUCLIDE` name to MARIS id.
    """
    def __init__(self, nuclide_mapping): fc.store_attr()

    def __call__(self, tfm):
        # Labels absent from `nuclide_mapping` come out of `Series.map` as NaN.
        seawater = tfm.dfs['SEAWATER']
        seawater['NUCLIDE'] = seawater['NUCLIDE'].map(self.nuclide_mapping)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping)
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
69447,69447,141.072167,37.333333,2015/2/23 09:25:00,T-D9,33,3,,,0.01
77944,77944,141.133333,38.25,2012/6/27 08:45:00,T-MG4,31,3,,,0.019
44309,44309,141.034444,37.431111,2021/07/05 08:35:00,T-1,33,3,0.55,,0.13
47450,47450,141.034444,37.431111,2025/01/16 07:28:00,T-1,31,3,0.88,,
73505,73505,141.072222,37.5,2012/11/9 08:35:00,T-D1,104,3,3.2,,


## Remap `DL` value to MARIS nomenclature

We remap `DL` (Detection Limit) value to MARIS ids as follows:
    
- if a `DL` value is reported, we assign `2` (Detection limit or '<')
- if a `DL` value is not reported, we assign `1` (Detected value or '=')

In [None]:
#| exports
class RemapDLCB(Callback):
    """
    Remap `DL` name to MARIS id.
    """
    def __init__(self): fc.store_attr()

    def dl_mapping(self, value):
        "Return 2 when a detection limit is reported (`value` not missing), 1 otherwise."
        return 2 if not pd.isna(value) else 1

    def __call__(self, tfm):
        # NOTE(review): the flag depends only on whether `DL` is reported, so a row
        # that also carries a measured `VALUE` is still flagged 2 - confirm this is intended.
        seawater = tfm.dfs['SEAWATER']
        seawater['DL'] = seawater['DL'].map(self.dl_mapping)

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(10)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
39497,39497,141.034444,37.431111,2016/10/25 06:55,T-1,31,3,2,,
41734,41734,141.034444,37.431111,2018/09/17 07:00:00,T-1,33,3,2,,0.067
74260,74260,141.072222,37.5,2016/5/2 08:15:00,T-D1,103,3,2,,
63421,63421,141.046667,37.430556,2024/08/29 07:09:00,T-0-1A,1,3,2,,
46475,46475,141.034444,37.431111,2023/12/11 07:25:00,T-1,1,3,2,,0.15
50885,50885,141.040278,37.430556,2013/09/11 09:16:00,T-0-1,103,3,2,,
52126,52126,141.040278,37.430556,2019/09/25 07:15:00,T-0-1,1,3,2,,
45713,45713,141.034444,37.431111,2023/03/02 07:35:00,T-1,33,3,2,,
72854,72854,141.072222,37.416667,2021/7/19 08:03:00,T-D5,1,3,2,,
43914,43914,141.034444,37.431111,2021/01/16 07:55:00,T-1,31,3,2,,


## Parse & encode time

In [None]:
#| exports
class ParseTimeCB(Callback):
    "Parse time column from TEPCO."
    def __init__(self,
                 time_name='TIME', # Name of the column holding the raw timestamps
                 formats=('%Y/%m/%d %H:%M:%S', '%Y/%m/%d %H:%M') # `strptime` formats, tried in order
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        # The source mixes timestamps with and without seconds (e.g.
        # '2011/3/22 14:28:00' and '2016/10/25 06:55'). Parsing with a single
        # strict format and `errors='coerce'` can silently turn the second kind
        # into NaT, so each format is tried in turn on the values that are
        # still unparsed.
        raw = tfm.dfs['SEAWATER'][self.time_name]
        first_format, *fallback_formats = self.formats
        parsed = pd.to_datetime(raw, format=first_format, errors='coerce')
        for fmt in fallback_formats:
            unparsed = parsed.isna() & raw.notna()
            if not unparsed.any():
                break
            # Values already parsed are kept untouched; only the gaps are filled.
            parsed = parsed.where(~unparsed, pd.to_datetime(raw, format=fmt, errors='coerce'))
        tfm.dfs['SEAWATER'][self.time_name] = parsed

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB()
    ])

df_test = tfm()['SEAWATER'] 
df_test.sample(5)




type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
51052,51052,141.040278,37.430556,1404900240,T-0-1,31,3,2,,
10065,10065,140.837222,35.796111,1366120740,T-E,31,3,2,,
86451,86451,141.583333,38.233333,1549444920,T-MG3,33,3,1,,0.0027
59250,59250,141.046667,37.423333,1500882600,T-0-2,103,3,2,,
18339,18339,141.026389,37.322222,1630407000,T-3,31,3,2,,


## Sanitize coordinates

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

df_test = tfm()['SEAWATER']
df_test.sample(5)



type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
86173,86173,141.583333,38.233333,1446628800,T-MG3,33,3,1,,0.0034
67582,67582,141.0625,37.552778,1346738340,T-S2,33,3,1,,0.025
48133,48133,141.039444,37.265,1499059260,T-S5,31,3,2,,0.0021
73125,73125,141.072222,37.416667,1703492520,T-D5,31,3,2,,
29346,29346,141.033611,37.415833,1655802300,T-2,103,3,1,,9.3


In [None]:
# df_test.shape

## Encode to NetCDF

In [None]:
#| eval: false
tfm = Transformer(dfs, cbs=[
    FixMissingValuesCB(),
    RemoveJapanaseCharCB(),
    FixRangeValueStringCB(),
    SelectColsOfInterestCB(common_coi, nuclides_pattern),
    WideToLongCB(),
    RemapUnitNameCB(unit_mapping),
    RemapNuclideNameCB(nuclide_mapping),
    RemapDLCB(),
    ParseTimeCB(),
    EncodeTimeCB(),
    SanitizeLonLatCB()
    ])

dfs_tfm = tfm()
tfm.logs



['Assign `NaN` to values equal to `ND` (not detected) - to be confirmed ',
 'Remove 約 (about) char',
 "Replace range values (e.g '4.0E+00<&<8.0E+00' or '1.0～2.7') by their mean",
 'Select columns of interest.',
 '\n    Parse TEPCO measurement columns to extract nuclide name, measurement value, \n    detection limit and uncertainty\n    ',
 '\n    Remap `UNIT` name to MARIS id.\n    ',
 '\n    Remap `NUCLIDE` name to MARIS id.\n    ',
 '\n    Remap `DL` name to MARIS id.\n    ',
 'Parse time column from TEPCO.',
 'Encode time as seconds since epoch.',
 'Drop rows with invalid longitude & latitude values. Convert `,` separator to `.` separator.']

In [None]:
dfs_tfm['SEAWATER'].sample(10)

type,ID,LON,LAT,TIME,STATION,NUCLIDE,UNIT,DL,UNC,VALUE
84511,84511,141.283333,38.333333,1323255360,T-MG1,31,3,2,,
24010,24010,141.033611,37.415833,1524293700,T-2,31,3,2,,
41490,41490,141.034444,37.431111,1530691200,T-1,31,3,2,,
24960,24960,141.033611,37.415833,1543735200,T-2,33,3,2,,
63243,63243,141.046667,37.430556,1709278800,T-0-1A,1,3,2,,
45249,45249,141.034444,37.431111,1660464000,T-1,33,3,2,,
24776,24776,141.033611,37.415833,1540105200,T-2,29,3,2,,
9964,9964,140.837222,35.796111,1331560140,T-E,33,3,2,,
77864,77864,141.133333,38.25,1325669580,T-MG4,33,3,2,,
25236,25236,141.033611,37.415833,1549263300,T-2,103,3,1,,14.0


In [None]:
#| exports
# Keywords written to the `keywords` global attribute; `get_attrs` joins them with ', '.
# NOTE(review): the second entry has no space before '>' ('Ocean Chemistry> Radionuclides')
# and some entries hold two comma-separated keywords - confirm against the keyword vocabulary.
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
#| exports
def get_attrs(
    tfm, # Transformer whose `dfs` and `logs` feed the attributes
    zotero_key, # Zotero item key handed to `ZoteroCB`
    kw=kw # Keywords, joined with ', ' into the `keywords` attribute
    ):
    "Build the NetCDF global attributes by running the metadata callbacks over `tfm.dfs`."
    metadata_cbs = [
        BboxCB(),
        TimeRangeCB(),
        ZoteroCB(zotero_key, cfg=cfg()),
        KeyValuePairCB('keywords', ', '.join(kw)),
        KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs)),
    ]
    feeder = GlobAttrsFeeder(tfm.dfs, cbs=metadata_cbs)
    return feeder()

In [None]:
#| eval: false
get_attrs(tfm, zotero_key='JEV6HP5A', kw=kw)

{'geospatial_lat_min': '141.66666667',
 'geospatial_lat_max': '38.63333333',
 'geospatial_lon_min': '140.60388889',
 'geospatial_lon_max': '35.79611111',
 'geospatial_bounds': 'POLYGON ((140.60388889 35.79611111, 141.66666667 35.79611111, 141.66666667 38.63333333, 140.60388889 38.63333333, 140.60388889 35.79611111))',
 'time_coverage_start': '2011-03-21T14:30:00',
 'time_coverage_end': '2025-01-25T07:24:00',
 'id': 'JEV6HP5A',
 'title': "Readings of Sea Area Monitoring - Monitoring of sea water - Sea area close to TEPCO's Fukushima Daiichi NPS / Coastal area - Readings of Sea Area Monitoring [TEPCO]",
 'summary': '',
 'creator_name': '[{"creatorType": "author", "firstName": "", "lastName": "TEPCO - Tokyo Electric Power Company"}]',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Mar

In [None]:
#| exports
def encode(
    fname_out: str, # Path of the NetCDF file to write
    **kwargs # Additional keyword arguments; only `verbose` is read here
    ):
    "Load the TEPCO sources, run the full callback pipeline and write the result as NetCDF."
    raw_dfs = load_data(fname_coastal_water, fname_clos1F, fname_iaea_orbs)

    pipeline = [
        FixMissingValuesCB(),
        RemoveJapanaseCharCB(),
        FixRangeValueStringCB(),
        SelectColsOfInterestCB(common_coi, nuclides_pattern),
        WideToLongCB(),
        RemapUnitNameCB(unit_mapping),
        RemapNuclideNameCB(nuclide_mapping),
        RemapDLCB(),
        ParseTimeCB(),
        EncodeTimeCB(),
        SanitizeLonLatCB(),
    ]
    tfm = Transformer(raw_dfs, cbs=pipeline)
    tfm()

    # Global attributes are derived from the transformed frames and the callback logs.
    global_attrs = get_attrs(tfm, zotero_key='JEV6HP5A', kw=kw)
    verbose = kwargs.get('verbose', False)
    NetCDFEncoder(
        tfm.dfs,
        dest_fname=fname_out,
        global_attrs=global_attrs,
        verbose=verbose,
    ).encode()

In [None]:
#| eval: false
encode(fname_out, verbose=False)

100%|██████████| 11/11 [00:08<00:00,  1.34it/s]
100%|██████████| 11/11 [00:04<00:00,  2.37it/s]




RuntimeError: NetCDF: Filter error: bad id or parameters or duplicate filter: (variable 'station', group 'seawater')

In [None]:
#| eval: false
decode(fname_in=fname_out, verbose=True)

Saved SEAWATER to ../../_data/output/tepco_SEAWATER.csv
