# HELCOM
> Data pipeline (handler) to convert HELCOM data ([source](https://helcom.fi/about-us)) to `NetCDF` format

## Packages import

In [None]:
# Automatically reload imported modules before executing each cell
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import pandas as pd
import numpy as np

from netCDF4 import Dataset
from datetime import datetime, timedelta
from cftime import num2date, date2num
from pathlib import Path

from marisco.utils import has_valid_varname
from marisco.serializers import to_netcdf
from marisco.configs import get_nc_tpl_path
from datetime import datetime
import re

# Path of the NetCDF template provided by `marisco.configs`; it is used below both to
# validate variable names and as the template argument of `to_netcdf`
NC_TPL_PATH = get_nc_tpl_path()

The data is provided as a Microsoft Access database. [`mdbtools`](https://github.com/mdbtools/mdbtools) is used to convert its tables into `.csv` files.

## Parameters

In [None]:
#| params
# Directory holding the `.csv` tables (passed to `load_helcom`)
fname_in = '../../_data/accdb/mors/csv'
# Destination NetCDF file (plain string: no interpolation is needed here)
fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
def load_helcom(src_dir,
                smp_types=('SEA', 'SED', 'BIO')):
    """Load HELCOM data and return them as individual dataframes by sample type.

    For each sample type code `XXX`, the measurement table `XXX02.csv` is
    left-joined with the sample table `XXX01.csv` on their shared `KEY` column,
    so every measurement row is kept and enriched with its sample information.

    Args:
        src_dir: directory containing the `.csv` tables.
        smp_types: iterable of sample type codes to load, each one of
            'SEA', 'SED' or 'BIO'.

    Returns:
        dict mapping 'seawater', 'sediment' and/or 'biota' to the merged dataframe.

    Raises:
        KeyError: if a code in `smp_types` is not a known sample type.
    """
    lut_smp_type = {'SEA': 'seawater', 'SED': 'sediment', 'BIO': 'biota'}
    src_dir = Path(src_dir)
    dfs = {}
    for smp_type in smp_types:
        # Resolve the group name first so that an unknown code fails before any file is read
        grp_name = lut_smp_type[smp_type]
        measurements = pd.read_csv(src_dir / (smp_type + '02.csv'))
        samples = pd.read_csv(src_dir / (smp_type + '01.csv'))
        dfs[grp_name] = pd.merge(measurements, samples, on='KEY', how='left')
    return dfs

def rename_cols(cols):
    """Flatten two-level (outer, inner) column labels into plain names.

    - labels with an empty inner level keep their outer name;
    - ('unc', x) becomes 'x_unc';
    - ('value', x) becomes 'x';
    - any other label with a non-empty inner level is skipped.
    """
    flat = []
    for level0, level1 in cols:
        if not level1:
            flat.append(level0)
            continue
        if level0 == 'unc':
            flat.append(level1 + '_' + level0)
        elif level0 == 'value':
            flat.append(level1)
    return flat

## Configs

In [None]:
# Settings handed to `to_netcdf` at the end of the notebook.
# NOTE(review): 'global_attr' presumably ends up as the global attributes of the output
# file — confirm against `marisco.serializers`. 'description', 'summary' and 'license'
# still hold placeholder values.
CONFIGS = {
    'global_attr': {
        'description': 'HELCOM dataset ...',
        'summary': '...',
        'keyword': 'HELCOM seawater biota sediment radionuclides',
        'license': 'tbd',
    }
}

## Load tables

In [None]:
# One merged (measurements + samples) dataframe per sample type, keyed by
# 'seawater', 'sediment' and 'biota'
dfs = load_helcom(fname_in)

In [None]:
# Preview the first seawater rows
dfs['seawater'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/m³,VALUE_Bq/m³,ERROR%_m³,DATE_OF_ENTRY_x,COUNTRY,LABORATORY,SEQUENCE,...,LONGITUDE (ddmmmm),LONGITUDE (dddddd),TDEPTH,SDEPTH,SALIN,TTEMP,FILT,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,WKRIL2012003,CS137,,,5.3,32.0,08/20/14 00:00:00,90,KRIL,2012003,...,29.2,29.3333,,0.0,,,,11,11,08/20/14 00:00:00
1,WKRIL2012004,CS137,,,19.9,20.0,08/20/14 00:00:00,90,KRIL,2012004,...,29.2,29.3333,,29.0,,,,11,11,08/20/14 00:00:00
2,WKRIL2012005,CS137,,,25.5,20.0,08/20/14 00:00:00,90,KRIL,2012005,...,23.09,23.15,,0.0,,,,11,3,08/20/14 00:00:00
3,WKRIL2012006,CS137,,,17.0,29.0,08/20/14 00:00:00,90,KRIL,2012006,...,27.59,27.9833,,0.0,,,,11,11,08/20/14 00:00:00
4,WKRIL2012007,CS137,,,22.2,18.0,08/20/14 00:00:00,90,KRIL,2012007,...,27.59,27.9833,,39.0,,,,11,11,08/20/14 00:00:00


In [None]:
# Preview the first biota rows
dfs['biota'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,BASIS,ERROR%,NUMBER,DATE_OF_ENTRY_x,COUNTRY,...,BIOTATYPE,TISSUE,NO,LENGTH,WEIGHT,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,BVTIG2012041,CS134,VTIG01,<,0.01014,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
1,BVTIG2012041,K40,VTIG01,,135.3,W,3.57,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
2,BVTIG2012041,CO60,VTIG01,<,0.01398,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
3,BVTIG2012041,CS137,VTIG01,,4.338,W,3.48,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
4,BVTIG2012040,CS134,VTIG01,<,0.009614,W,,,02/27/14 00:00:00,6.0,...,F,5,17.0,45.9,964.0,18.458,92.9,2,16,02/27/14 00:00:00


## Data preparation

### Normalize radionuclide names

We first normalize the `NUCLIDE` names so that they can be consistently remapped to the naming conventions expected by the MARIS NetCDF CDL.

In [None]:
# Converting nuclides name to lower case and strip any trailing space(s)
# Lower-case the nuclide names and strip leading/trailing whitespace, in place
for df in dfs.values():
    df['NUCLIDE'] = df['NUCLIDE'].apply(lambda name: name.lower().strip())

Let's get the list of unique radionuclide types measured across samples:

In [None]:
# Gather the nuclide names found in every sample type (duplicates across types included)
nuclides = []
for df in dfs.values():
    nuclides.extend(df['NUCLIDE'].unique().tolist())

# Start from an identity mapping {source name: target name}
var_names = {name: name for name in set(nuclides)}
var_names

{'sb124': 'sb124',
 'co60': 'co60',
 'ra226': 'ra226',
 'pb214': 'pb214',
 'eu155': 'eu155',
 'cs142': 'cs142',
 'eu152': 'eu152',
 'sn117m': 'sn117m',
 'pu239240': 'pu239240',
 'sr89': 'sr89',
 'la140': 'la140',
 'th232': 'th232',
 'ba140': 'ba140',
 'cs134': 'cs134',
 'te129m': 'te129m',
 'cs139': 'cs139',
 'bi214': 'bi214',
 'co57': 'co57',
 'cs146': 'cs146',
 'sr90': 'sr90',
 'ra228': 'ra228',
 'pb212': 'pb212',
 'th234': 'th234',
 'pu238': 'pu238',
 'sn113': 'sn113',
 'ru106': 'ru106',
 'ag110m': 'ag110m',
 'tc99': 'tc99',
 'h3': 'h3',
 'zr95': 'zr95',
 'np237': 'np237',
 'cm243244': 'cm243244',
 'cs143': 'cs143',
 'pu240': 'pu240',
 'nb95': 'nb95',
 'tl208': 'tl208',
 'rb86': 'rb86',
 'mn54': 'mn54',
 'cs140': 'cs140',
 'k40': 'k40',
 'sc46': 'sc46',
 'ir192': 'ir192',
 'pu239': 'pu239',
 'cs137': 'cs137',
 'ac228': 'ac228',
 'cs145': 'cs145',
 'cs141': 'cs141',
 'gd153': 'gd153',
 'am241': 'am241',
 'cs144': 'cs144',
 'cs138': 'cs138',
 'cs134137': 'cs134137',
 'pb210': 'pb210',

In [None]:
# Check if these variable names consistent with MARIS CDL
# Check whether these variable names are consistent with the MARIS CDL
has_valid_varname(var_names, NC_TPL_PATH)

"cs142" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL
"cm243244" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs138" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"k-40" variable name not found in MARIS CDL
"pu238240" variable name not found in MARIS CDL


False

In [None]:
# Renaming above mentioned nuclides accordingly
# Map the source names reported above to the names used here as their MARIS CDL equivalents.
# NOTE(review): `var_names` does not appear to be applied to the dataframes in the
# cells visible here — confirm that the remapping is actually used downstream.
var_names['k-40'] = 'k40'
var_names['cm243244'] = 'cm243_244_tot'
var_names['cs134137'] = 'cs134_137_tot'
var_names['pu239240'] = 'pu239_240_tot'
var_names['pu238240'] = 'pu238_240_tot'

In [None]:
# Check again after the remapping: names still reported have no MARIS CDL equivalent yet
has_valid_varname(var_names, NC_TPL_PATH)

"cs142" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs138" variable name not found in MARIS CDL


False

In [None]:
# Columns available for sediment samples
dfs['sediment'].columns

Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',
       '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',
       'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',
       'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',
       'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',
       'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',
       'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],
      dtype='object')

In [None]:
# Columns available for biota samples
dfs['biota'].columns

Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',
       'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',
       'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',
       'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',
       'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',
       'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',
       'DATE_OF_ENTRY_y'],
      dtype='object')

In [None]:
# NOTE(review): this cell lists the sediment columns a second time — presumably
# `dfs['seawater'].columns` was intended; confirm and re-run.
dfs['sediment'].columns

Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',
       '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',
       'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',
       'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',
       'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',
       'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',
       'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],
      dtype='object')

### Parse date

In [None]:
# Parse the `DATE` column into a datetime `time` column for every sample type.
# NOTE: `infer_datetime_format` is deprecated as of pandas 2.0 (strict inference is the default there).
for df in dfs.values():
    df['time'] = pd.to_datetime(df['DATE'], infer_datetime_format=True)

## Data transformation

In [None]:
# Source columns kept for each sample type.
# Coordinates are always taken from the decimal-degree (`dddddd`) columns: the
# `ddmmmm` columns appear to hold degrees-and-minutes values (e.g. 29.2 next to
# 29.3333 in the seawater preview) and must not be mixed with decimal degrees.
# NOTE(review): seawater and sediment take `depth` from TDEPTH whereas biota uses
# SDEPTH — confirm this is intended.
cols_of_interest = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
                                  'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
                    'sediment': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time',
                                 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
                    'biota': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time',
                              'SDEPTH', 'LATITUDE dddddd', 'LONGITUDE dddddd']}

# Source column name -> normalized column name. Column spellings differ between
# sample types (e.g. 'LATITUDE (dddddd)' vs 'LATITUDE dddddd'), hence several
# source names per target.
renaming_rules = {
    'NUCLIDE': 'nuclide',
    'VALUE_Bq/m³': 'value',
    'VALUE_Bq/kg': 'value',
    'ERROR%_m³': 'unc',
    'ERROR%_kg': 'unc',
    'ERROR%': 'unc',
    'TDEPTH': 'depth',
    'SDEPTH': 'depth',
    'LATITUDE (dddddd)': 'lat',
    'LATITUDE dddddd': 'lat',
    'LONGITUDE (dddddd)': 'lon',
    'LONGITUDE dddddd': 'lon',
}

In [None]:
def format_time(x):
    "Encode `x` as a number of seconds since 1970-01-01 00:00:00"
    return date2num(x, units="seconds since 1970-01-01 00:00:00.0")

for k in dfs.keys():
    # Select the columns of interest, normalize their names, then pivot to one row per
    # (time, lat, lon, depth) with one column per nuclide for both values and uncertainties.
    # Built as a single chain so that nothing is renamed in place on a `.loc` selection.
    # Note: `pivot_table` aggregates with the mean by default, so measurements sharing the
    # same index are averaged. NOTE(review): rows with a missing index value are presumably
    # dropped by the pivot — verify.
    dfs[k] = (
        dfs[k]
        .loc[:, cols_of_interest[k]]
        .rename(columns=renaming_rules)
        .pivot_table(index=['time', 'lat', 'lon', 'depth'],
                     columns='nuclide', values=['value', 'unc'])
        .reset_index()
    )

    # Flatten cols name
    dfs[k].columns = rename_cols(dfs[k].columns)

    # Set index
    dfs[k].index.name = 'sample'

    # Encode time as seconds since 1970-01-01
    dfs[k]['time'] = dfs[k]['time'].apply(format_time)

In [None]:
# Inspect the transformed seawater table (one row per sample, one column per nuclide)
dfs['seawater']

Unnamed: 0_level_0,time,lat,lon,depth,ag110m_unc,am241_unc,ba140_unc,ce144_unc,cm242_unc,cm243244_unc,...,pu240,ru103,ru106,sb125,sr89,sr90,tc99,u234,u238,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,452217600,54.466700,11.9833,17.0,,,,,,,...,,,,,,27.000,,,,
1,452217600,54.600000,11.0833,23.0,,,,,,,...,,,,,,,,,,
2,452217600,54.866700,10.8333,38.0,,,,,,,...,,,,,,28.000,,,,
3,452217600,54.950000,12.6833,23.0,,,,,,,...,,,,,,,,,,
4,452217600,55.383333,11.0500,22.0,,,,,,,...,,,,,,28.000,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4581,1535068800,59.380000,24.1550,21.0,,,,,,,...,,,,,,,,,,
4582,1535328000,59.305000,23.2880,88.0,,,,,,,...,,,,,,,,,,
4583,1536278400,58.603300,11.2450,15.5,,,,,,,...,,,,,,,,,,
4584,1536969600,59.033300,21.0795,172.0,,,,,,,...,,,,,,6.795,,,,


In [None]:
def units_fn(grp_name,
             rdn_name):
    """Return the unit of variable `rdn_name` within group `grp_name`.

    For the 'seawater', 'sediment' and 'biota' groups, names containing '_unc'
    are expressed in '%'; other names use the activity unit of the group.
    Any other group yields 'undefined'.
    """
    activity_units = {
        'seawater': 'Bq/m³',
        'sediment': 'Bq/kg',
        'biota': 'Bq/kg',
    }
    if grp_name not in activity_units:
        return 'undefined'
    return '%' if '_unc' in rdn_name else activity_units[grp_name]

In [None]:
# Serialize the per-sample-type dataframes to `fname_out` using the template at `NC_TPL_PATH`;
# `units_fn` supplies the unit of each variable.
# NOTE(review): exact handling of groups and discarded columns lives in `marisco.serializers` — verify there.
to_netcdf(dfs, NC_TPL_PATH, fname_out, CONFIGS, units_fn)

% of discarded data for grp seawater: 0.0
% of discarded data for grp sediment: 0.0
% of discarded data for grp biota: 0.0
