# HELCOM
> Data pipeline (handler) to convert HELCOM data ([source](https://helcom.fi/about-us)) to `NetCDF` format

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import pandas as pd
import numpy as np

from netCDF4 import Dataset
from datetime import datetime, timedelta
from cftime import num2date, date2num
from pathlib import Path
from datetime import datetime
import re

from marisco.utils import has_valid_varname, get_bbox
from marisco.serializers import to_netcdf
from marisco.configs import get_nc_tpl_path, get_cfgs
from marisco.metadata import Metadata, ZoteroItem

NC_TPL_PATH = get_nc_tpl_path()

The data is provided as a Microsoft Access database. [`Mdbtools`](https://github.com/mdbtools/mdbtools) is used to convert its tables into `.csv` files on Unix-like operating systems.

## Parameters

In [None]:
#| params
fname_in = '../../_data/accdb/mors/csv'  # directory holding the `.csv` tables (passed to `load_helcom` as `src_dir`)
fname_out = '../../_data/output/helcom.nc'  # output path handed to `to_netcdf`

## Utils

In [None]:
def load_helcom(src_dir,
                smp_types=('SEA', 'SED', 'BIO')):
    """Load HELCOM data and return them as individual dataframes by sample type.

    For every sample type code, the measurement table (`<code>02.csv`) is
    left-joined with the sample table (`<code>01.csv`) on the `KEY` column.

    Args:
        src_dir: directory (str or Path) containing the `.csv` tables.
        smp_types: iterable of sample type codes among 'SEA', 'SED', 'BIO'.

    Returns:
        dict mapping 'seawater' / 'sediment' / 'biota' to the merged dataframe.

    Raises:
        KeyError: if a code in `smp_types` is not one of the known codes.
    """
    lut_smp_type = {'SEA': 'seawater', 'SED': 'sediment', 'BIO': 'biota'}
    src_dir = Path(src_dir)
    dfs = {}
    for smp_type in smp_types:
        # Resolve the group name first so an unknown code fails before any file is read
        grp_name = lut_smp_type[smp_type]
        df_meas = pd.read_csv(src_dir/(smp_type + '02.csv'))  # measurements
        df_smp = pd.read_csv(src_dir/(smp_type + '01.csv'))   # samples
        dfs[grp_name] = pd.merge(df_meas, df_smp, on='KEY', how='left')
    return dfs

def rename_cols(cols):
    """Flatten two-level (multiindex) column labels into plain strings.

    Args:
        cols: iterable of `(outer, inner)` label pairs, e.g. the columns of a
            pivoted dataframe after `reset_index()`.

    Returns:
        A list with exactly one name per input pair:
        - `outer` when `inner` is empty (index columns),
        - `inner` when `outer == 'value'`,
        - `inner + '_' + outer` otherwise (e.g. `'cs137_unc'`).
    """
    new_cols = []
    for outer, inner in cols:
        if not inner:
            new_cols.append(outer)
        elif outer == 'value':
            new_cols.append(inner)
        else:
            # 'unc' and any other value level get suffixed, so that no column
            # is silently dropped and the output length always matches the input
            new_cols.append(inner + '_' + outer)
    return new_cols

## Configs

In [None]:
# Handler settings; 'global_attr' starts empty and is assigned in the "Encoding" section
CONFIGS = {
    'global_attr': {}
}

## Load tables

In [None]:
dfs = load_helcom(fname_in)

In [None]:
dfs['seawater'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/m³,VALUE_Bq/m³,ERROR%_m³,DATE_OF_ENTRY_x,COUNTRY,LABORATORY,SEQUENCE,...,LONGITUDE (ddmmmm),LONGITUDE (dddddd),TDEPTH,SDEPTH,SALIN,TTEMP,FILT,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,WKRIL2012003,CS137,,,5.3,32.0,08/20/14 00:00:00,90,KRIL,2012003,...,29.2,29.3333,,0.0,,,,11,11,08/20/14 00:00:00
1,WKRIL2012004,CS137,,,19.9,20.0,08/20/14 00:00:00,90,KRIL,2012004,...,29.2,29.3333,,29.0,,,,11,11,08/20/14 00:00:00
2,WKRIL2012005,CS137,,,25.5,20.0,08/20/14 00:00:00,90,KRIL,2012005,...,23.09,23.15,,0.0,,,,11,3,08/20/14 00:00:00
3,WKRIL2012006,CS137,,,17.0,29.0,08/20/14 00:00:00,90,KRIL,2012006,...,27.59,27.9833,,0.0,,,,11,11,08/20/14 00:00:00
4,WKRIL2012007,CS137,,,22.2,18.0,08/20/14 00:00:00,90,KRIL,2012007,...,27.59,27.9833,,39.0,,,,11,11,08/20/14 00:00:00


In [None]:
dfs['biota'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,BASIS,ERROR%,NUMBER,DATE_OF_ENTRY_x,COUNTRY,...,BIOTATYPE,TISSUE,NO,LENGTH,WEIGHT,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,DATE_OF_ENTRY_y
0,BVTIG2012041,CS134,VTIG01,<,0.01014,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
1,BVTIG2012041,K40,VTIG01,,135.3,W,3.57,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
2,BVTIG2012041,CO60,VTIG01,<,0.01398,W,,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
3,BVTIG2012041,CS137,VTIG01,,4.338,W,3.48,,02/27/14 00:00:00,6.0,...,F,5,16.0,45.7,948.0,18.453,92.9,2,16,02/27/14 00:00:00
4,BVTIG2012040,CS134,VTIG01,<,0.009614,W,,,02/27/14 00:00:00,6.0,...,F,5,17.0,45.9,964.0,18.458,92.9,2,16,02/27/14 00:00:00


In [None]:
# NOTE(review): `df` and `name_var_src` are not defined by any cell visible above, so this
# looks like leftover scratch code relying on hidden kernel state — confirm it can be removed.
pd.to_numeric(df.reset_index()[name_var_src], errors='coerce', downcast=None)

In [None]:
dfs['biota']['BIOTATYPE'].unique()

array(['F', 'B', 'P'], dtype=object)

In [None]:
dfs['biota']['TISSUE'].unique()

array([ 5,  1, 41,  3, 51, 43, 42, 12, 10, 18, 52, 20,  8, 54, 53])

In [None]:
dfs['sediment'].head()

Unnamed: 0,KEY,NUCLIDE,METHOD,< VALUE_Bq/kg,VALUE_Bq/kg,ERROR%_kg,< VALUE_Bq/m²,VALUE_Bq/m²,ERROR%_m²,DATE_OF_ENTRY_x,...,LOWSLI,AREA,SEDI,OXIC,DW%,LOI%,MORS_SUBBASIN,HELCOM_SUBBASIN,SUM_LINK,DATE_OF_ENTRY_y
0,SKRIL2012048,RA226,,,35.0,26.0,,,,08/20/14 00:00:00,...,20.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
1,SKRIL2012049,RA226,,,36.0,22.0,,,,08/20/14 00:00:00,...,27.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
2,SKRIL2012050,RA226,,,38.0,24.0,,,,08/20/14 00:00:00,...,2.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
3,SKRIL2012051,RA226,,,36.0,25.0,,,,08/20/14 00:00:00,...,4.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00
4,SKRIL2012052,RA226,,,30.0,23.0,,,,08/20/14 00:00:00,...,6.0,0.006,,,,,11.0,11.0,,08/20/14 00:00:00


## Data preparation

### Normalize radionuclide names

We first normalize `NUCLIDE` names so that we can consistently remap them to expected NetCDF MARIS CDL naming conventions.

In [None]:
# Convert nuclide names to lower case and strip surrounding whitespace.
# The vectorised `.str` accessor propagates missing values, whereas calling
# `x.lower()` element by element raises on any non-string entry.
for k in dfs.keys():
    dfs[k]['NUCLIDE'] = dfs[k]['NUCLIDE'].str.lower().str.strip()

Let's get the list of unique radionuclide types measured across samples:

In [None]:
def get_unique_nuclides(dfs):
    """Return the nuclide names found across all sample types, without duplicates.

    Args:
        dfs: dict of dataframes, each with a `NUCLIDE` column.

    Returns:
        List of distinct nuclide names in first-seen order (iterating the
        dataframes in dict order).
    """
    nuclides = []
    for df in dfs.values():
        nuclides += df['NUCLIDE'].unique().tolist()
    # `unique()` only de-duplicates within one dataframe; `dict.fromkeys` removes
    # the names shared by several sample types while keeping first-seen order
    return list(dict.fromkeys(nuclides))

get_unique_nuclides(dfs)[:5]

['cs137', 'sr90', 'h3', 'cs134', 'pu238']

In [None]:
# Check whether these variable names are consistent with the MARIS CDL
has_valid_varname(get_unique_nuclides(dfs), NC_TPL_PATH)

"pu239240" variable name not found in MARIS CDL
"cm243244" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"pu238240" variable name not found in MARIS CDL
"pu239240" variable name not found in MARIS CDL
"cs134137" variable name not found in MARIS CDL
"k-40" variable name not found in MARIS CDL
"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


False

In [None]:
# Identity look-up table: every nuclide name initially maps to itself
varnames_lut = dict((name, name) for name in set(get_unique_nuclides(dfs)))

In [None]:
# Override the entries whose source name differs from the target variable name
varnames_lut.update({
    'k-40': 'k40',
    'cm243244': 'cm243_244_tot',
    'cs134137': 'cs134_137_tot',
    'pu239240': 'pu239_240_tot',
    'pu238240': 'pu238_240_tot',
})

In [None]:
# Update nuclide names accordingly.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on the column selection: under pandas
# copy-on-write that selection is a temporary object, so modifying it in place
# would leave the dataframe unchanged.
for k in dfs.keys():
    dfs[k]['NUCLIDE'] = dfs[k]['NUCLIDE'].replace(varnames_lut)

In [None]:
has_valid_varname(get_unique_nuclides(dfs), NC_TPL_PATH)

"cs138" variable name not found in MARIS CDL
"cs139" variable name not found in MARIS CDL
"cs140" variable name not found in MARIS CDL
"cs141" variable name not found in MARIS CDL
"cs142" variable name not found in MARIS CDL
"cs143" variable name not found in MARIS CDL
"cs144" variable name not found in MARIS CDL
"cs145" variable name not found in MARIS CDL
"cs146" variable name not found in MARIS CDL


False

In [None]:
dfs['sediment'].columns

Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',
       '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',
       'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',
       'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',
       'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',
       'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',
       'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],
      dtype='object')

In [None]:
dfs['biota'].columns

Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'BASIS',
       'ERROR%', 'NUMBER', 'DATE_OF_ENTRY_x', 'COUNTRY', 'LABORATORY',
       'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY', 'STATION',
       'LATITUDE ddmmmm', 'LATITUDE dddddd', 'LONGITUDE ddmmmm',
       'LONGITUDE dddddd', 'SDEPTH', 'RUBIN', 'BIOTATYPE', 'TISSUE', 'NO',
       'LENGTH', 'WEIGHT', 'DW%', 'LOI%', 'MORS_SUBBASIN', 'HELCOM_SUBBASIN',
       'DATE_OF_ENTRY_y'],
      dtype='object')

In [None]:
dfs['sediment'].columns

Index(['KEY', 'NUCLIDE', 'METHOD', '< VALUE_Bq/kg', 'VALUE_Bq/kg', 'ERROR%_kg',
       '< VALUE_Bq/m²', 'VALUE_Bq/m²', 'ERROR%_m²', 'DATE_OF_ENTRY_x',
       'COUNTRY', 'LABORATORY', 'SEQUENCE', 'DATE', 'YEAR', 'MONTH', 'DAY',
       'STATION', 'LATITUDE (ddmmmm)', 'LATITUDE (dddddd)',
       'LONGITUDE (ddmmmm)', 'LONGITUDE (dddddd)', 'DEVICE', 'TDEPTH',
       'UPPSLI', 'LOWSLI', 'AREA', 'SEDI', 'OXIC', 'DW%', 'LOI%',
       'MORS_SUBBASIN', 'HELCOM_SUBBASIN', 'SUM_LINK', 'DATE_OF_ENTRY_y'],
      dtype='object')

In [None]:
dfs['sediment']['SEDI'].unique()

array([ nan, -99.,   0.,  55.,  11.,  57.,  51.,  52.,  22.,  10.,  44.,
         5.,  50.,  15.,   1.,  40.,  33.,  43.,  59.,  54.,   9.,  45.,
        14.,  41.,  25.,  42.,  24.,  12.,  58.,  13.,   7.,  49.,  48.,
         4.,  47.,  23.,  20.,  46.,   2.,  34.,  32.,  56.,  35.,  73.,
        21.])

### Parse date

In [None]:
# Add a `time` column parsed from `DATE` to every sample-type dataframe
for k, df_grp in dfs.items():
    df_grp['time'] = pd.to_datetime(df_grp['DATE'], infer_datetime_format=True)

## Data transformation

In [None]:
# Make measurement and uncertainty units consistent
def fix_units(df, meas_col, unc_col):
    """Convert a relative uncertainty given in percent into measurement units.

    Args:
        df: dataframe holding both columns.
        meas_col: name of the measured value column (e.g. 'VALUE_Bq/kg').
        unc_col: name of the relative uncertainty column, in percent
            (e.g. 'ERROR%').

    Returns:
        A Series aligned on `df.index` with the uncertainty expressed in the
        same unit as `meas_col`; missing inputs yield NaN.
    """
    # Percent -> fraction is a division by 100 (a 0.1 factor overstates the
    # uncertainty tenfold). The column-wise product also returns an empty
    # Series, not an empty DataFrame, when `df` has no rows.
    return df[unc_col] * df[meas_col] / 100

# One (sample type, measurement column, uncertainty column) triplet per group
arr = [
    ('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),
    ('biota', 'VALUE_Bq/kg', 'ERROR%'),
    ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg'),
]

# Overwrite each uncertainty column with the output of `fix_units`
for grp, val, unc in arr:
    df_grp = dfs[grp]
    df_grp[unc] = fix_units(df_grp, val, unc)

In [None]:
# Define columns of interest by sample type.
# Coordinates are read from the decimal-degree (`dddddd`) columns for every
# sample type. The `ddmmmm` columns hold degrees.minutes (the seawater preview
# shows 29.2 there against 29.3333 in decimal degrees for the same point), so
# they must not be used as `lat` / `lon`.
# NOTE(review): seawater takes its depth from `TDEPTH` although that table also
# has an `SDEPTH` column — confirm which one is intended.
cols_of_interest = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
                                 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
                    'sediment': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time',
                                 'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)',
                                 'SEDI'],
                    'biota': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time',
                              'SDEPTH', 'LATITUDE dddddd', 'LONGITUDE dddddd',
                              'BIOTATYPE', 'TISSUE']}

# Define column names renaming rules.
# Kept next to `cols_of_interest` because every selected column that feeds
# `lat`, `lon`, `depth`, `value` or `unc` needs an entry here.
renaming_rules = {
    'NUCLIDE': 'nuclide',
    'VALUE_Bq/m³': 'value',
    'VALUE_Bq/kg': 'value',
    'ERROR%_m³': 'unc',
    'ERROR%_kg': 'unc',
    'ERROR%': 'unc',
    'TDEPTH': 'depth',
    'SDEPTH': 'depth',
    # seawater / sediment tables write the coordinate format in parentheses,
    # the biota table does not
    'LATITUDE (dddddd)': 'lat',
    'LATITUDE dddddd': 'lat',
    'LONGITUDE (dddddd)': 'lon',
    'LONGITUDE dddddd': 'lon',
    # group specific
    'BIOTATYPE': 'bio_group',
    'TISSUE': 'body_part',
    'SEDI': 'sed_type'
}

In [None]:
# Time units are looked up once rather than once per row
time_units = get_cfgs('units')['time']
# Encode time as a number expressed in `time_units`
format_time = lambda x: date2num(x, units=time_units)

cols = ['nuclide']
vals = ['value', 'unc']

for k in dfs.keys():
    # Select cols of interest and rename them; `rename` returns a new frame,
    # so nothing is modified in place on a slice of the original
    dfs[k] = dfs[k].loc[:, cols_of_interest[k]].rename(columns=renaming_rules)

    # All remaining columns identify a sample. A list comprehension (not a set
    # difference) keeps their order deterministic from run to run.
    idx = [c for c in dfs[k].columns if c not in cols + vals]

    # Pivot: one `value` and one `unc` column per nuclide.
    # NOTE(review): `pivot_table` aggregates with the mean by default, so
    # duplicated (sample, nuclide) entries are averaged, and rows with a missing
    # value in any `idx` column appear to be dropped by the grouping — verify
    # that both are acceptable for this dataset.
    dfs[k] = dfs[k].pivot_table(index=idx,
                                columns=cols, values=vals).reset_index()

    # Flatten cols name
    dfs[k].columns = rename_cols(dfs[k].columns)

    # Set index
    dfs[k].index.name = 'sample'

    # Encode time
    dfs[k]['time'] = dfs[k]['time'].apply(format_time)

In [None]:
dfs['seawater'].head()

Unnamed: 0_level_0,lon,time,lat,depth,ag110m_unc,am241_unc,ba140_unc,ce144_unc,cm242_unc,cm243_244_tot_unc,...,pu240,ru103,ru106,sb125,sr89,sr90,tc99,u234,u238,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,1429574400,0.0,12.0,,,,,,,...,,,,,,5.759,,,,
1,0.0,1429747200,0.0,4.0,,,,,,,...,,,,,,6.758,,,,
2,0.0,1430352000,0.0,13.0,,,,,,,...,,,,,,,,,,
3,0.0,1431993600,0.0,81.0,,,,,,,...,,,,,,,,,,
4,0.0,1432080000,0.0,69.0,,,,,,,...,,,,,,7.255,,,,


In [None]:
dfs['biota'].head()

Unnamed: 0_level_0,body_part,bio_group,time,lat,depth,lon,ac228_unc,ag108m_unc,ag110m_unc,am241_unc,...,sr89,sr90,tc99,te129m,th228,th232,tl208,u235,zn65,zr95
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,B,630028800,54.33,50.6,15.2,,,0.0,,...,,,,,,,,,,
1,1,B,1066176000,54.13,0.0,13.43,,,,0.005992,...,,,,,,,,,,
2,1,F,519955200,54.5,0.0,17.321,,,,,...,,,,,,,,,,
3,1,F,534211200,54.3,19.0,10.37,,,0.16288,,...,,0.021,,,,,,,,
4,1,F,534211200,54.32,20.5,10.08,,,0.27692,,...,,0.0602,,,,,,,,


In [None]:
# Keep only the rows where both longitude and latitude are non-zero
# (i.e. a row is dropped as soon as either coordinate equals 0)
dfs = {grp: df.loc[df['lon'].ne(0) & df['lat'].ne(0)] for grp, df in dfs.items()}

## Encoding

In [None]:
# Populate the NetCDF global attributes metadata
item_id = '26VMZZ2Q'
global_attrs = Metadata(dfs)
CONFIGS['global_attr'] = global_attrs.fill(
    ZoteroItem(item_id, get_cfgs('zotero'))
)
CONFIGS['global_attr']

{'id': '',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': '',
 'keywords_vocabulary': 'GCMD Science Keywords',
 'keywords_vocabulary_url': 'https://gcmd.earthdata.nasa.gov/static/kms/',
 'record': '',
 'featureType': '',
 'cdm_data_type': '',
 'Conventions': 'CF-1.10 ACDD-1.3',
 'publisher_name': 'Paul MCGINNITY, Iolanda OSVATH,

In [None]:
# Keywords, one list item per keyword. Hierarchy levels are separated by ' > ';
# keywords themselves are joined with ', ' when written to the global attributes.
kw = ['oceanography',
      'Earth Science > Oceans > Ocean Chemistry > Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers',
      'Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry',
      'Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']

In [None]:
# Store the keywords as a single comma-separated string, then display the configuration
CONFIGS['global_attr']['keywords'] = ', '.join(kw)
CONFIGS

{'global_attr': {'id': '',
  'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
  'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
  'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sedi

### To NetCDF

In [None]:
def units_fn(grp_name,
             rdn_name):
    """Return the measurement unit of sample group `grp_name`.

    `rdn_name` is accepted but not used: the unit depends on the group only.
    Raises `KeyError` for a group other than 'seawater', 'sediment' or 'biota'.
    """
    units_by_group = dict(seawater='Bq/m³', sediment='Bq/kg', biota='Bq/kg')
    return units_by_group[grp_name]

In [None]:
# Serialize all sample groups; presumably writes the NetCDF file at `fname_out` from the
# template at `NC_TPL_PATH` — TODO confirm against `marisco.serializers.to_netcdf`
to_netcdf(dfs, NC_TPL_PATH, fname_out, CONFIGS, units_fn)

% of discarded data for grp seawater: 0.0
% of discarded data for grp sediment: 0.0
% of discarded data for grp biota: 7.010663611527276
