# TEPCO dataset
> Importing, cleaning and transforming a TEPCO dataset ([Source](https://radioactivity.nsr.go.jp/ja/list/349/list-1.html)).

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import pandas as pd
import numpy as np

from netCDF4 import Dataset
from datetime import datetime, timedelta
from cftime import num2date, date2num

from datetime import datetime
import re

## Discussion points

- Lionel approach
- TEPCO pipeline overview
- From MARIS template CDL -> `.nc`
    - overview of MARIS CDL
        - seawater, sediment, biota, suspended-matter groups (to handle different units for the same radionuclide?). [Ref. to Unidata Users Guide](https://docs.unidata.ucar.edu/nug/current/best_practices.html#bp_Variable-Grouping).
- note on time encoding
- variable attributes: https://docs.unidata.ucar.edu/netcdf-c/current/attribute_conventions.html
- missing values
- are valid_min, valid_max variable attributes relevant?
- compression + `least_significant_digit`
- GEOTRACES Quality Check flags may be relevant?

## Utils
To be moved later into a dedicated API library.

In [None]:
def get_rules(path):
    """Load a YAML rules file and return its parsed content.

    Args:
        path: Location of the YAML file to read.

    Returns:
        The Python object produced by PyYAML's safe loader for that file.
    """
    # Local import: PyYAML is only required when this helper is called.
    import yaml

    with open(path) as stream:
        rules = yaml.safe_load(stream)
    return rules

## Load tables

The data is provided as a single `.xls` file. A preview of this file, e.g. using [Open Office](http://www.openoffice.org/), shows that a single sheet holds two datasets: the measurements and their sampling locations. The two can be joined on the `Sampling point number` column.

### Measurements

In [None]:
# Path to the TEPCO coastal water workbook read by the cells below.
fname = '../../_data/xls/tepco/coastal_water.xlsx'

In [None]:
# Measurements table: skip the first sheet row, then read 23643 rows.
# An empty 'Sampling time' cell is replaced by midnight ('00:00:00').
df = pd.read_excel(
    fname,
    skiprows=1,
    nrows=23643,
    converters={
        'Sampling time': lambda value: value if value != '' else '00:00:00',
    },
)
df.head()

Unnamed: 0,Sampling date,Sampling time,Sampling point number,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),...,144Ce detection limit (Bq/L),54Mn radioactivity concentration (Bq/L),54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49
0,2011-03-21,23:15:00,T-3,,1100,13,48,9.2,53,8.8,...,,,,,,,,,,
1,2011-03-21,23:45:00,T-4,,660,12,31,8.7,33,8.3,...,,,,,,,,,,
2,2011-03-22,14:28:00,T-3,,1100,20,46,14.0,40,14.0,...,,,,,,,,,,
3,2011-03-22,15:06:00,T-4,,670,19,39,11.0,44,11.0,...,,,,,,,,,,
4,2011-03-23,13:51:00,T-3,,740,27,51,20.0,55,20.0,...,,,,,,,,34.0,25.0,


### `Location` data

In [None]:
# Locations table, stored in the same sheet as the measurements:
# skip 23647 rows, then read 48 rows from the first three columns.
df_loc = pd.read_excel(
    fname,
    skiprows=23647,
    nrows=48,
    usecols=[0, 1, 2],
)
df_loc.head()

Unnamed: 0,Sampling point number,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
0,T-0,37.42,141.04
1,T-11,37.24,141.05
2,T-12,37.15,141.04
3,T-13-1,37.64,141.04
4,T-14,37.55,141.06


### Data join

In [None]:
# Attach the coordinates of each sampling point to its measurements.
# The inner join keeps only measurements whose sampling point has a location.
df = df.merge(df_loc, how="inner", on='Sampling point number')
# The join key is not needed once the coordinates are attached.
df = df.drop(columns=['Sampling point number'])
df.index.name = 'sample'
df.head()

Unnamed: 0_level_0,Sampling date,Sampling time,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),...,54Mn detection limit (Bq/L),3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal)
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2011-03-21,23:15:00,,1100,13,48,9.2,53,8.8,160,...,,,,,,,,,37.32,141.03
1,2011-03-22,14:28:00,,1100,20,46,14.0,40,14.0,ND,...,,,,,,,,,37.32,141.03
2,2011-03-23,13:51:00,,740,27,51,20.0,55,20.0,200,...,,,,,,34.0,25.0,,37.32,141.03
3,2011-03-24,09:30:00,,1100,52,99,38.0,94,41.0,120,...,,,,,,56.0,44.0,,37.32,141.03
4,2011-03-25,10:00:00,,430,10,26,7.4,34,5.9,58,...,,,,,,,,,37.32,141.03


## Data preparation

### Address missing values

In [None]:
# 'ND' stands for "not detected"; those cells are blanked out as NaN here.
# Open question: how should they be handled, given that the detection-limit
# columns are available alongside the concentrations?
not_detected = df == 'ND'
df[not_detected] = np.nan

### Parse time

In [None]:
def get_datetime(col):
    """Combine a sampling date and a sampling time into one ``datetime``.

    Args:
        col: Row-like iterable (e.g. a ``pandas.Series`` row or a list) whose
            first item exposes ``.date()`` and whose second item renders as
            ``HH:MM:SS`` through ``str()``.

    Returns:
        datetime: The naive timestamp built from both items.

    Raises:
        ValueError: If fewer than two items are given, or if the combined
            text does not match ``%Y-%m-%d %H:%M:%S``.
    """
    # Read the two items by position through iteration rather than ``col[0]``:
    # integer keys on a label-indexed Series rely on a deprecated positional
    # fallback in pandas.
    date_part, time_part = tuple(col)[:2]
    day = str(date_part.date())
    time = str(time_part)
    return datetime.strptime(day + ' ' + time, '%Y-%m-%d %H:%M:%S')

# Build a single 'time' column from the separate date and time columns.
time_cols = ['Sampling date', 'Sampling time']
df['time'] = df[time_cols].apply(get_datetime, axis=1)
# NOTE(review): the result of drop() is not assigned, so this line only affects
# what the cell displays; `df` itself keeps both source columns.
df.drop(columns=time_cols)

Unnamed: 0_level_0,Collection layer of seawater,131I radioactivity concentration (Bq/L),131I detection limit (Bq/L),134Cs radioactivity concentration (Bq/L),134Cs detection limit (Bq/L),137Cs radioactivity concentration (Bq/L),137Cs detection limit (Bq/L),132I radioactivity concentration (Bq/L),132I detection limit (Bq/L),132Te radioactivity concentration (Bq/L),...,3H radioactivity concentration (Bq/L),3H detection limit (Bq/L),125Sb radioactivity concentration (Bq/L),125Sb detection limit (Bq/L),105Ru radioactivity concentration (Bq/L),105Ru detection limit (Bq/L),Unnamed: 49,Sampling coordinate North latitude (Decimal),Sampling coordinate East longitude (Decimal),time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,1100,13,48,9.2,53,8.8,160,44.0,,...,,,,,,,,37.32,141.03,2011-03-21 23:15:00
1,,1100,20,46,14,40,14,,88.0,,...,,,,,,,,37.32,141.03,2011-03-22 14:28:00
2,,740,27,51,20,55,20,200,58.0,,...,,,,,34.0,25.0,,37.32,141.03,2011-03-23 13:51:00
3,,1100,52,99,38,94,41,120,88.0,,...,,,,,56.0,44.0,,37.32,141.03,2011-03-24 09:30:00
4,,430,10,26,7.4,34,5.9,58,22.0,13.0,...,,,,,,,,37.32,141.03,2011-03-25 10:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21472,上層,,,,0.0011,0.011,,,,,...,,,,,,,,37.48,141.04,2022-07-12 10:10:00
21473,上層,,,,,,,,,,...,,0.37,,,,,,37.48,141.04,2022-07-12 10:10:00
21474,上層,,,,0.0013,0.01,,,,,...,,,,,,,,37.48,141.04,2022-07-19 10:00:00
21475,上層,,,,,,,,,,...,,0.38,,,,,,37.48,141.04,2022-07-19 10:00:00


In [None]:
def format_time(x):
    """Return `x` encoded as seconds since 1960-01-01 00:00:00."""
    return date2num(x, units="seconds since 1960-01-01 00:00:00.0")


# Replace each timestamp by its numeric encoding.
df['time'] = df['time'].apply(format_time)

### Rename columns

In [None]:
def normalize_cols(df, rules, verbose=True):
    """Keep the columns that have a renaming rule and rename them.

    Args:
        df: Data frame whose columns are looked up in `rules`.
        rules: Mapping from original column name to normalized name.
        verbose: When true, print the matched rules followed by the columns
            for which no rule exists.

    Returns:
        A data frame restricted to the columns found in `rules`, renamed
        according to `rules`.
    """
    if verbose:
        separator = '-' * 50

        print('Found rules:')
        print(separator)
        for name in df.columns:
            if name in rules:
                print(name, '->', rules[name])

        print('\nNot found column(s):')
        print(separator)
        for name in df.columns:
            if name not in rules:
                print(name)

    # Boolean mask (one flag per column) selecting the columns with a rule.
    has_rule = [name in rules for name in df.columns]
    selected = df.loc[:, has_rule]
    return selected.rename(columns=rules)
    
# Keep and rename the columns listed under the 'tepco' key of the rules file.
df_norm = normalize_cols(df, get_rules('rules-simplified.yml')['tepco'])

Found rules:
--------------------------------------------------
131I radioactivity concentration (Bq/L) -> i131
134Cs radioactivity concentration (Bq/L) -> cs134
137Cs radioactivity concentration (Bq/L) -> cs137
239Pu+240Pu radioactivity concentration (Bq/L) -> pu239_240_ratio
Total alpha radioactivity concentration (Bq/L) -> alpha_tot
Total beta radioactivity concentration (Bq/L) -> beta_tot
Sampling coordinate North latitude (Decimal) -> latitude
Sampling coordinate East longitude (Decimal) -> longitude
time -> time

Not found column(s):
--------------------------------------------------
Sampling date
Sampling time
Collection layer of seawater
131I detection limit (Bq/L)
134Cs detection limit (Bq/L)
137Cs detection limit (Bq/L)
132I radioactivity concentration (Bq/L)
132I detection limit (Bq/L)
132Te radioactivity concentration (Bq/L)
132Te detection limit (Bq/L)
136Cs radioactivity concentration (Bq/L)
136Cs detection limit (Bq/L)
140La radioactivity concentration (Bq/L)
140La detec

In [None]:
# Preview the first rows of the normalized table.
df_norm.head()

Unnamed: 0_level_0,i131,cs134,cs137,pu239_240_ratio,alpha_tot,beta_tot,latitude,longitude,time
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1100,48,53,,,,37.32,141.03,1616368500
1,1100,46,40,,,,37.32,141.03,1616423280
2,740,51,55,,,,37.32,141.03,1616507460
3,1100,99,94,,,,37.32,141.03,1616578200
4,430,26,34,,,,37.32,141.03,1616666400


In [None]:
# Summarize the column dtypes and non-null counts of the normalized table.
df_norm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21477 entries, 0 to 21476
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   i131             164 non-null    object 
 1   cs134            6472 non-null   object 
 2   cs137            15717 non-null  object 
 3   pu239_240_ratio  33 non-null     object 
 4   alpha_tot        0 non-null      object 
 5   beta_tot         71 non-null     object 
 6   latitude         21477 non-null  float64
 7   longitude        21477 non-null  float64
 8   time             21477 non-null  int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 1.6+ MB


In [None]:
# Cast `i131` to float for display only: the result is not assigned,
# so `df_norm` is left unchanged.
df_norm['i131'].astype(float)

sample
0        1100.0
1        1100.0
2         740.0
3        1100.0
4         430.0
          ...  
21472       NaN
21473       NaN
21474       NaN
21475       NaN
21476       NaN
Name: i131, Length: 21477, dtype: float64

### Encoding to NetCDF

In [None]:
# Settings for the NetCDF export.
cfg = {
    # Presumably dataset-level metadata (placeholder text for now); this entry
    # is not read by the export code shown in this notebook.
    'global': {
        'description': 'TEPCO ...',
        'summary': '...',
    },
    # Names of the template groups to copy into the output file.
    'groups': ['seawater']
}

ref: https://stackoverflow.com/questions/15141563/python-netcdf-making-a-copy-of-all-variables-and-attributes-but-one

In [None]:
# Build the output file from the MARIS template: copy its global attributes and
# dimensions, then, for each group listed in cfg['groups'], create and fill the
# template variables that also exist as columns of `df_norm`.
with Dataset('maris-template.nc') as src, Dataset('output/tepco.nc', 'w') as dst:
    # copy global attributes all at once via dictionary
    dst.setncatts(src.__dict__)

    # copy dimensions (unlimited dimensions stay unlimited)
    for name, dimension in src.dimensions.items():
        dst.createDimension(
            name, (len(dimension) if not dimension.isunlimited() else None))

    # Expose the index as a regular column once, instead of calling
    # reset_index() again for every template variable.
    df_out = df_norm.reset_index()

    # copy groups of interest
    grps = [(name, grp) for name, grp in src.groups.items() if name in cfg['groups']]
    for name_src, grp_src in grps:
        grp_dst = dst.createGroup(name_src)

        # copy all variables of interest and fill them
        for name_var_src, var_src in grp_src.variables.items():
            if name_var_src not in df_out.columns:
                continue
            var_dst = grp_dst.createVariable(name_var_src, var_src.datatype, var_src.dimensions,
                                             compression='zlib', complevel=9)
            # fill variables
            var_dst[:] = df_out[name_var_src].values
            # copy variable attributes all at once via dictionary
            # (kept after the data write, as in the original statement order)
            var_dst.setncatts(var_src.__dict__)

### Reading a NetCDF

And getting back a pandas data frame

In [None]:
# Read two variables of the 'seawater' group back from the exported file.
with Dataset('output/tepco.nc') as ds:
    seawater = ds.groups['seawater']
    for label, var_name in (('Cs137: ', 'cs137'), ('longitude: ', 'longitude')):
        print(label, seawater.variables[var_name][:])

Cs137:  [5.3e+01 4.0e+01 5.5e+01 ... 1.0e-02     nan 1.5e-02]
longitude:  [141.03 141.03 141.03 ... 141.04 141.04 141.04]


### To MARIS DB
- Needs pivoting from wide to long

In [None]:
# excludes detection limit and error columns for now
#colnames = [name for name in df.columns if re.search('_dl|_err', name) is None]

In [None]:
# gets nuclide cols only
#nucl_cols = [name for name in df[colnames].columns if name not in ['datemeas', 'latitude', 'longitude']]

In [None]:
#nucl_cols

In [None]:
# Wide -> long
#pd.melt(df[colnames].reset_index(), 
#        id_vars=['sample_id', 'datemeas', 'latitude', 'longitude'],
#        value_vars=nucl_cols,
#        value_name='activity',
#        var_name='nuclide',
#       ).sort_values(by='sample_id')