In [1]:
# Inputs
fname_in = '../../_data/xls/tepco/coastal_water.xlsx'
fname_out = '../../_data/output/tepco.nc'

# TEPCO 
> Data pipeline (handler) to convert TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)) to `NetCDF` format

## Packages import

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np

from netCDF4 import Dataset
from datetime import datetime, timedelta
from cftime import num2date, date2num

from marisco.utils import has_valid_varname, get_bbox
from marisco.serializers import to_netcdf
from marisco.configs import NC_TPL_PATH
from datetime import datetime
import re

## Configs

In [4]:
CONFIGS = {
    'global_attr': {
        'description': 'TEPCO dataset ...',
        'summary': '...',
        'keyword': 'MARIS TEPCO sediments',
        'license': 'tbd',
    },
    'var_names': {
        'Sampling coordinate North latitude (Decimal)': 'lat',
        'Sampling coordinate East longitude (Decimal)': 'lon',
        '131I radioactivity concentration (Bq/L)': 'i131',
        '131I detection limit (Bq/L)': 'i131_dl',
        '134Cs radioactivity concentration (Bq/L)': 'cs134',
        '134Cs detection limit (Bq/L)': 'cs134_dl',
        '137Cs radioactivity concentration (Bq/L)': 'cs137',
        '137Cs detection limit (Bq/L)': 'cs137_dl',
        '132I radioactivity concentration (Bq/L)': 'i132',
        '132I detection limit (Bq/L)': 'i132_dl',
        '132Te radioactivity concentration (Bq/L)': 'te132',
        '132Te detection limit (Bq/L)': 'te132_dl',
        '136Cs radioactivity concentration (Bq/L)': 'cs136',
        '136Cs detection limit (Bq/L)': 'cs136_dl',
        '140La radioactivity concentration (Bq/L)': 'la140',
        '140La detection limit (Bq/L)': 'la140_dl',
        '89Sr radioactivity concentration (Bq/L)': 'sr89',
        '89Sr detection limit (Bq/L)': 'sr89_dl',
        '90Sr radioactivity concentration (Bq/L)': 'sr90',
        '90Sr detection limit (Bq/L)': 'sr90_dl',
        '238Pu radioactivity concentration (Bq/L)': 'pu238',
        '238Pu detection limit (Bq/L)': 'pu238_dl',
        '239Pu+240Pu radioactivity concentration (Bq/L)': 'pu239_240_tot',
        '239Pu+240Pu statistical error (Bq/L)': 'pu239_240_tot_unc',
        '239Pu+240Pu detection limit (Bq/L)': 'pu239_240_tot_dl',
        'Total alpha radioactivity concentration (Bq/L)': 'talpha',
        'Total alpha detection limit (Bq/L)': 'talpha_dl',
        'Total beta radioactivity concentration (Bq/L)': 'tbeta',
        'Total beta detection limit (Bq/L)': 'tbeta_dl',
        '140Ba radioactivity concentration (Bq/L)': 'ba140',
        '140Ba detection limit (Bq/L)': 'ba140_dl',
        '106Ru radioactivity concentration (Bq/L)': 'ru106',
        '106Ru detection limit (Bq/L)': 'ru106_dl',
        '58Co radioactivity concentration (Bq/L)': 'co58',
        '58Co detection limit (Bq/L)': 'co58_dl',
        '60Co radioactivity concentration (Bq/L)': 'co60',
        '60Co detection limit (Bq/L)': 'co60_dl',
        '144Ce radioactivity concentration (Bq/L)': 'ce144',
        '144Ce detection limit (Bq/L)': 'ce144_dl',
        '54Mn radioactivity concentration (Bq/L)': 'mn54',
        '54Mn detection limit (Bq/L)': 'mn54_dl',
        '3H radioactivity concentration (Bq/L)': 'h3',
        '3H detection limit (Bq/L)': 'h3_dl', 
        '125Sb radioactivity concentration (Bq/L)': 'sb125',
        '125Sb detection limit (Bq/L)': 'sb125_dl',
        '105Ru radioactivity concentration (Bq/L)': 'ru105',
        '105Ru detection limit (Bq/L)': 'ru105_dl'
    }    
}

## Load tables

The data is provided as a single `.xls` file. A preview of this file, e.g using [Open Office](http://www.openoffice.org/), indicates the presence of two datasets in a single sheet: the measurements and their locations that can be joined using the `Sampling point number` column.

### Measurements

In [5]:
df = pd.read_excel(fname_in, skiprows=1, nrows=23643,
                   converters={'Sampling time': lambda x: x if x != '' else '00:00:00'}); df.head()

Unnamed: 0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,144Ce detection limit (Bq/L),54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49
0,2011-03-21,23:15:00,T-3,,1100,13,48,9.2,53,8.8,...,,,,,,,,,,
1,2011-03-21,23:45:00,T-4,,660,12,31,8.7,33,8.3,...,,,,,,,,,,
2,2011-03-22,14:28:00,T-3,,1100,20,46,14.0,40,14.0,...,,,,,,,,,,
3,2011-03-22,15:06:00,T-4,,670,19,39,11.0,44,11.0,...,,,,,,,,,,
4,2011-03-23,13:51:00,T-3,,740,27,51,20.0,55,20.0,...,,,,,,,,34.0,25.0,


### `Location` data

In [6]:
df_loc = pd.read_excel(fname_in, skiprows=23647, nrows=48, usecols=[0,1,2]); df_loc.head()

Unnamed: 0,Sampling point number,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
0,T-0,37.42,141.04
1,T-11,37.24,141.05
2,T-12,37.15,141.04
3,T-13-1,37.64,141.04
4,T-14,37.55,141.06


### Data join

In [7]:
# Join
df = pd.merge(df, df_loc, how="inner", on='Sampling point number')
df.drop(columns=['Sampling point number'], inplace=True)
df.index.name = 'sample'; df.head()

Unnamed: 0_level_0,Sampling date,Sampling time,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,,1100,13,48,9.2,53,8.8,160,...,,,,,,,,,37.32,141.03
1,2011-03-22,14:28:00,,1100,20,46,14.0,40,14.0,ND,...,,,,,,,,,37.32,141.03
2,2011-03-23,13:51:00,,740,27,51,20.0,55,20.0,200,...,,,,,,34.0,25.0,,37.32,141.03
3,2011-03-24,09:30:00,,1100,52,99,38.0,94,41.0,120,...,,,,,,56.0,44.0,,37.32,141.03
4,2011-03-25,10:00:00,,430,10,26,7.4,34,5.9,58,...,,,,,,,,,37.32,141.03


## Data preparation

### Address missing values

In [8]:
# Not detected. What should we do? 
# We have the detected field
df[df == 'ND'] = np.nan

### Parse time

In [9]:
def get_datetime(col):
    day = str(col[0].date())
    time = str(col[1])
    return datetime.strptime(day + ' ' + time, '%Y-%m-%d %H:%M:%S')

time_cols = ['Sampling date', 'Sampling time']
df['time'] = df[time_cols].apply(get_datetime, axis=1)
df.drop(columns=time_cols)

Unnamed: 0_level_0,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),132Te radioactivity concentration (Bq/L),...,3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal),time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,1100,13,48,9.2,53,8.8,160,44.0,,...,,,,,,,,37.32,141.03,2011-03-21 23:15:00
1,,1100,20,46,14,40,14,,88.0,,...,,,,,,,,37.32,141.03,2011-03-22 14:28:00
2,,740,27,51,20,55,20,200,58.0,,...,,,,,34.0,25.0,,37.32,141.03,2011-03-23 13:51:00
3,,1100,52,99,38,94,41,120,88.0,,...,,,,,56.0,44.0,,37.32,141.03,2011-03-24 09:30:00
4,,430,10,26,7.4,34,5.9,58,22.0,13.0,...,,,,,,,,37.32,141.03,2011-03-25 10:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21472,上層,,,,0.0011,0.011,,,,,...,,,,,,,,37.48,141.04,2022-07-12 10:10:00
21473,上層,,,,,,,,,,...,,0.37,,,,,,37.48,141.04,2022-07-12 10:10:00
21474,上層,,,,0.0013,0.01,,,,,...,,,,,,,,37.48,141.04,2022-07-19 10:00:00
21475,上層,,,,,,,,,,...,,0.38,,,,,,37.48,141.04,2022-07-19 10:00:00


In [10]:
# Encoding time as seconds since ...
format_time = lambda x: date2num(x, units="seconds since 1970-01-01 00:00:00.0")
df['time'] = df['time'].apply(format_time)

### Rename columns

In [11]:
has_valid_varname(CONFIGS['var_names'], NC_TPL_PATH)
df.rename(columns=CONFIGS['var_names'], inplace=True)

In [12]:
df.head()

Unnamed: 0_level_0,Sampling date,Sampling time,Collection layer of seawater,i131,i131_dl,cs134,cs134_dl,cs137,cs137_dl,i132,...,h3,h3_dl,sb125,sb125_dl,ru105,ru105_dl,Unnamed: 49,lat,lon,time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,,1100,13,48,9.2,53,8.8,160.0,...,,,,,,,,37.32,141.03,1300749300
1,2011-03-22,14:28:00,,1100,20,46,14.0,40,14.0,,...,,,,,,,,37.32,141.03,1300804080
2,2011-03-23,13:51:00,,740,27,51,20.0,55,20.0,200.0,...,,,,,34.0,25.0,,37.32,141.03,1300888260
3,2011-03-24,09:30:00,,1100,52,99,38.0,94,41.0,120.0,...,,,,,56.0,44.0,,37.32,141.03,1300959000
4,2011-03-25,10:00:00,,430,10,26,7.4,34,5.9,58.0,...,,,,,,,,37.32,141.03,1301047200


In [13]:
# To get the bbox (when needed)
get_bbox(df)

(140.6, 35.8, 141.67, 38.63)

## Encoding

### To NetCDF

In [14]:
dfs = {'sediment': df}

def units_fn(grp_name, rdn_name): 
    return 'Bq/l'

to_netcdf(dfs, NC_TPL_PATH, fname_out, CONFIGS, units_fn)

% of discarded data for grp sediment: 6.429371649410898


### To csv
TBD ...