In [None]:
#| default_exp handlers.generic

# Generic
> Generic data pipeline (handler) to convert datasets to `NetCDF` format

The input data is a dump from already imported MARIS datasets.


**Questions**:
1. do we put areaname?
2. when converted to tall to wide a sample might have several nuclide type, but their measurement methods might be different? Maybe put them in metadata or variable attribute?
3. is `decayedto` used and in what context? (question to be answered for all columns actually)
4. what are units of uncertainty?
5. do we keep detection?
6. which columns for which sample type?
7. f(nuclide) or f(nuclide, measurement) ? E.g volume, counmet, unit?
8. need areea_id -> area_name lut to be used as enumtype
9. ref_id vs. zoterourl vs. displaytext
   1.  e.g ref_id=129 and 130 points to the same dataset?

**Dev. board**: https://trello.com/b/IszgV1bj/marisco

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import pandas as pd
from tqdm import tqdm
from functools import partial
import fastcore.all as fc

from pathlib import Path

from marisco.utils import (has_valid_varname, match_worms)
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)

from marisco.metadata import (GlobAttrsFeeder, BboxCB,
                              DepthRangeCB, TimeRangeCB,
                              ZoteroCB, KeyValuePairCB)

from marisco.serializers import to_netcdf
# from marisco.configs import get_nc_tpl_path, BASE_PATH, NUCLIDES_LUT
from marisco.configs import get_nc_tpl_path, BASE_PATH

In [None]:
pd.set_option('display.max_rows', 500)

In [None]:
NC_TPL_PATH = get_nc_tpl_path()

In [None]:
fname_in = '../../_data/exploded/3-top-and-clarke-1983.csv' # has salinity and temperature
# fname_in = '../../_data/exploded/6-kautsky-and-eicke-1982.csv' # has volume
# fname_out = '../../_data/output/3-top-and-clarke-1983.nc'

In [None]:
fname_in = Path().home() / 'pro/data/maris/all-maris.txt'

In [None]:
df = pd.read_csv(fname_in, sep='\t', encoding='ISO-8859-1')   

  df = pd.read_csv(fname_in, sep='\t', encoding='ISO-8859-1')


In [None]:
df.head()

Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,shiftedcoordinates,shiftedlong,shiftedlat,id
0,594163,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1304,SOURCE FILE NAME: 316N145_7_00398_00001_hy1.nc.,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"89.373,-31.597",0xE6100000010CF085C954C1983FC0A4703D0AD7575640,89.3725,-31.596667,1
1,594135,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1303,SOURCE FILE NAME: 33RR20090320_00181_00002_hy1...,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"109.551,-31.595",0xE6100000010C63EE5A423E983FC02506819543635B40,109.551389,-31.594722,2
2,594143,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1303,SOURCE FILE NAME: 33RR20090320_00181_00002_hy1...,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"109.551,-31.595",0xE6100000010C63EE5A423E983FC02506819543635B40,109.551389,-31.594722,3
3,594160,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1304,SOURCE FILE NAME: 316N145_7_00398_00001_hy1.nc.,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"89.373,-31.597",0xE6100000010CF085C954C1983FC0A4703D0AD7575640,89.3725,-31.596667,4
4,594152,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1303,SOURCE FILE NAME: 33RR20090320_00181_00002_hy1...,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"109.551,-31.595",0xE6100000010C63EE5A423E983FC02506819543635B40,109.551389,-31.594722,5


In [None]:
df['samplingyear']

0         1995
1         2009
2         2009
3         1995
4         2009
          ... 
818982    2014
818983    2014
818984    2014
818985    2014
818986    2014
Name: samplingyear, Length: 818987, dtype: int64

In [None]:
df.columns

Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'biogroup_id', 'taxondb',
       'taxondbid', 'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar',
       'sliceup', 'slicedown', 'sedtype_id', 'sedtype', 'sedrepname',
       'nuclide_id', 'nusymbol', 'volume', 'salinity', 'temperatur',
       'filtered', 'filtpore', 'samparea', 'drywt', 'wetwt', 'percentwt',
       'sampmet_id', 'sampmet', 'prepmet_id', 'prepmet', 'drymet_id', 'drymet',
       'counmet_id', 'counmet', 'decayedto', 'detection', 'activity',
       'uncertaint', 'unit_id', 'unit', 'vartype', 'freq', 'rangelow',
       'rangeupp', 'profile', 'transect_id', 'measure_note', 'shapetype_id',
       'profile_id', 'sampnote', 'ref_full

In [None]:
df['filtered'].unique()

array([nan, 'Y', 'N'], dtype=object)

In [None]:
df['detection'] == '='

0         True
1         True
2         True
3         True
4         True
          ... 
818982    True
818983    True
818984    True
818985    True
818986    True
Name: detection, Length: 818987, dtype: bool

In [None]:
df_sw = df[df['samptype'] == 'Seawater']
print(df_sw.shape)
df_sw = df_sw[df_sw['detection'] == '=']
print(df_sw.shape)
df_sw = df_sw[['areaname', 'latitude', 'longitude', 'nusymbol', 'samplingyear', 'activity', 'unit', 'sampdepth']]


(414051, 80)
(295107, 80)


In [None]:
df_sw.head()

Unnamed: 0,areaname,latitude,longitude,nusymbol,samplingyear,activity,unit,sampdepth
0,Indian Ocean,-31.596667,89.3725,3H,1995,0.4352,TU,608.4
1,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,1363.3
2,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,3199.5
3,Indian Ocean,-31.596667,89.3725,3H,1995,0.7237,TU,287.7
4,Indian Ocean,-31.594722,109.551389,3H,2009,0.476,TU,92.9


In [None]:
# depth = -1 reallocated to 0
# df_sw['sampdepth'][df_sw['sampdepth'] == -1] = 0

In [None]:
nuclide_of_interest = ['137Cs', '90Sr', '3H', '239,240Pu']
df_sw = df_sw[df_sw['nusymbol'].isin(nuclide_of_interest)]

In [None]:
df_sw.head()

Unnamed: 0,areaname,latitude,longitude,nusymbol,samplingyear,activity,unit,sampdepth
0,Indian Ocean,-31.596667,89.3725,3H,1995,0.4352,TU,608.4
1,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,1363.3
2,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,3199.5
3,Indian Ocean,-31.596667,89.3725,3H,1995,0.7237,TU,287.7
4,Indian Ocean,-31.594722,109.551389,3H,2009,0.476,TU,92.9


In [None]:
df['unit'].unique()

array(['TU', 'DELTA/mill', 'Bq/m3', 'Bq/kgd', 'Bq/kgw', 'atom/kg',
       'kg/kg', 'NOT AVAILABLE', 'atom/l', 'Bq/kg', 'Bq/m2', 'atom/kgd'],
      dtype=object)

In [None]:
df_sw.groupby(['nusymbol', 'unit']).size()

nusymbol   unit 
137Cs      Bq/m3    94504
239,240Pu  Bq/m3     7331
3H         Bq/m3    86194
90Sr       Bq/m3    16651
dtype: int64

In [None]:
# Conversion TU -> Bq/m3
is_TU = df_sw['unit'] == 'TU'
df_sw.loc[is_TU, 'activity'] = df_sw[is_TU]['activity'] * 119
df_sw.loc[is_TU, 'unit'] = "Bq/m3"

In [None]:
# Conversion Bq/kg -> Bq/m3
is_kg = df_sw['unit'] == 'Bq/kg'
df_sw.loc[is_kg, 'activity'] = df_sw[is_kg]['activity'] / 1025
df_sw.loc[is_kg, 'unit'] = "Bq/m3"

In [None]:
df_sw.head()

Unnamed: 0,areaname,latitude,longitude,nusymbol,samplingyear,activity,unit,sampdepth
0,Indian Ocean,-31.596667,89.3725,3H,1995,51.7888,Bq/m3,608.4
1,Indian Ocean,-31.594722,109.551389,3H,2009,0.119,Bq/m3,1363.3
2,Indian Ocean,-31.594722,109.551389,3H,2009,0.119,Bq/m3,3199.5
3,Indian Ocean,-31.596667,89.3725,3H,1995,86.1203,Bq/m3,287.7
4,Indian Ocean,-31.594722,109.551389,3H,2009,56.644,Bq/m3,92.9


In [None]:
df_sw.to_csv(Path.home() / 'pro/data/maris/all_maris_seawater_202312_lamer_pres.csv', index=None)

In [None]:
df_sw.sampdepth.max()

9738.0

In [None]:
# df.shape

In [None]:
# df.groupby('samptype').size()

In [None]:
# df.groupby('ref_id').size().sort_values(ascending=False)

In [None]:
# df.columns

In [None]:
col = 'detection'
len(df[col].unique())

4

In [None]:
df[col].unique()

array(['=', '<', 'ND', 'DE'], dtype=object)

In [None]:
# df.samptype.unique()

In [None]:
# df.head()

In [None]:
# len(df.ref_id.unique())

In [None]:
# len(df.displaytext.unique())

In [None]:
# len(df.ref_fulltext.unique())

In [None]:
# len(df.zoterourl.unique())

In [None]:
# df.groupby('ref_id').size().sort_values(ascending=False)

In [None]:
# df[df['ref_id'] == 129].zoterourl.iloc[0]

In [None]:
# df[df['ref_id'] == 129].samptype.unique()

In [None]:
# df[df['ref_id'] == 129].ref_fulltext.iloc[0]

In [None]:
# df[df['ref_id'] == 129].displaytext.iloc[0]

In [None]:
# df[df['ref_id'] == 130].zoterourl.iloc[0]

In [None]:
# df[df['ref_id'] == 130].ref_fulltext.iloc[0]

In [None]:
# df[df['ref_id'] == 130].samptype.unique()

In [None]:
# df.columns

In [None]:
# fname_in = '../../_data/exploded/6-kautsky-and-eicke-1982.csv'

## Utils

In [None]:
#| export
def load_data(fname):
    "Load generic MARIS data and return them as individual dataframe by sample type"
    dfs = {}
    df = pd.read_csv(fname)
    for name, group in df.groupby('samptype'):
        key = name.lower().replace(' ', '_')
        dfs[key] = group
    return dfs

def rename_cols(cols):
    "Flatten multiindex columns"
    new_cols = []
    for outer, inner in cols:
        if not inner:
            new_cols.append(outer)
        else:
            if outer == 'unc':
                new_cols.append(inner + '_' + outer)
            if outer == 'value':
                new_cols.append(inner)
    return new_cols

## Load tables

In [None]:
dfs = load_data(fname_in)

# [['samptype_id', 'samptype', 'latitude',
#     'longitude', 'begperiod', 'sampdepth',
#     'nuclide_id', 'nusymbol', 'activity', 'uncertaint', 
#     'detection']]

In [None]:
dfs['seawater'].head()

Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,displaycoordinates,DisplayLong,DisplayLat,id
0,18090,3319,Black Sea,1,Seawater,3,"Top and Clarke, 1983",https://www.zotero.org/groups/2432820/maris/it...,,,...,,,"Top, Z., Clarke, W.B., 1983. Helium, neon, and...",1983,1,3342.833,0xE6100000010CDFE00B93A96A45400000000000804040,33.0,42.833333,713335
1,18082,3319,Black Sea,1,Seawater,3,"Top and Clarke, 1983",https://www.zotero.org/groups/2432820/maris/it...,,,...,,,"Top, Z., Clarke, W.B., 1983. Helium, neon, and...",1983,1,3342.833,0xE6100000010CDFE00B93A96A45400000000000804040,33.0,42.833333,713336
2,18079,3319,Black Sea,1,Seawater,3,"Top and Clarke, 1983",https://www.zotero.org/groups/2432820/maris/it...,,,...,,,"Top, Z., Clarke, W.B., 1983. Helium, neon, and...",1983,1,3342.833,0xE6100000010CDFE00B93A96A45400000000000804040,33.0,42.833333,713337
3,18076,3319,Black Sea,1,Seawater,3,"Top and Clarke, 1983",https://www.zotero.org/groups/2432820/maris/it...,,,...,,,"Top, Z., Clarke, W.B., 1983. Helium, neon, and...",1983,1,3342.833,0xE6100000010CDFE00B93A96A45400000000000804040,33.0,42.833333,713338
4,18084,3319,Black Sea,1,Seawater,3,"Top and Clarke, 1983",https://www.zotero.org/groups/2432820/maris/it...,,,...,,,"Top, Z., Clarke, W.B., 1983. Helium, neon, and...",1983,1,3342.833,0xE6100000010CDFE00B93A96A45400000000000804040,33.0,42.833333,713339


In [None]:
dfs['seawater'].columns

Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'taxondb', 'taxondbid',
       'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar', 'sliceup',
       'slicedown', 'sedtype_id', 'sedtype', 'sedrepname', 'nuclide_id',
       'nusymbol', 'volume', 'salinity', 'temperatur', 'filtered', 'filtpore',
       'samparea', 'drywt', 'wetwt', 'percentwt', 'sampmet_id', 'sampmet',
       'prepmet_id', 'prepmet', 'drymet_id', 'drymet', 'counmet_id', 'counmet',
       'decayedto', 'detection', 'activity', 'uncertaint', 'unit_id', 'unit',
       'vartype', 'freq', 'rangelow', 'rangeupp', 'profile', 'transect_id',
       'measure_note', 'shapetype_id', 'profile_id', 'sampnote',
       'ref_fulltext', 'ref_yea

In [None]:
dfs['seawater'].lab_id

0     44
1     44
2     44
3     44
4     44
5     44
6     44
7     44
8     44
9     44
10    44
11    44
12    44
13    44
14    44
15    44
16    44
17    44
Name: lab_id, dtype: int64

## Data transformation pipeline

### Normalize nuclide names

In [None]:
# | export
class RemapRdnNameCB(Callback):
    "Remap to MARIS radionuclide names"
    def __call__(self, tfm):
        # lut = get_nuclides_lut()
        lut = NUCLIDES_LUT
        for k in tfm.dfs.keys():
            tfm.dfs[k]['nusymbol'].replace(lut, inplace=True)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB()])

print(tfm()['seawater']['nusymbol'].unique())

['h3']


### Parse time

In [None]:
#| export
class ParseTimeCB(Callback):
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['time'] = pd.to_datetime(tfm.dfs[k].begperiod, 
                                                format='%Y-%m-%d %H:%M:%S.%f')

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB(),
                            ParseTimeCB()])

print(tfm()['seawater']['time'][:5])

0   1975-04-01
1   1975-04-01
2   1975-04-01
3   1975-04-01
4   1975-04-01
Name: time, dtype: datetime64[ns]


### Normalize uncertainty units

In [None]:
dfs['seawater']['uncertaint']

0     0.4
1     0.2
2     0.3
3     0.4
4     0.3
5     0.4
6     0.2
7     0.2
8     0.2
9     0.2
10    0.2
11    0.3
12    0.2
13    0.3
14    0.4
15    0.3
16    0.3
17    0.5
Name: uncertaint, dtype: float64

In [None]:
# Make measurement and uncertainty units consistent
# def fix_units(df, meas_col, unc_col):
#     return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)

In [None]:
# Columns of interest
# coi_units_unc = [('seawater', 'VALUE_Bq/m³', 'ERROR%_m³'),
#                  ('biota', 'VALUE_Bq/kg', 'ERROR%'),
#                  ('sediment', 'VALUE_Bq/kg', 'ERROR%_kg')]

In [None]:
# class NormalizeUncUnitCB(Callback):
#     "Convert uncertainty from % to activity unit"

#     def __init__(self, coi=coi_units_unc): fc.store_attr()

#     def __call__(self, tfm):
#         for grp, val, unc in self.coi:
#             tfm.dfs[grp][unc] = self.fix_units(tfm.dfs[grp], val, unc)

#     def fix_units(self, df, meas_col, unc_col):
#         return df.apply(lambda row: row[unc_col] * row[meas_col]/100, axis=1)

In [None]:
# dfs = load_data(fname_in)
# tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
#                             RemapRdnNameCB(),
#                             ParseTimeCB(),
#                             NormalizeUncUnitCB()])

# print(tfm()['seawater'][['VALUE_Bq/m³', 'ERROR%_m³']][:5])

### Lookup biota species

In [None]:
# df_rubin = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
# df_rubin.head(5)

In [None]:
# def get_species_lut(fname_in, overwrite=False):
#     fname_lut = 'species_helcom.pkl'
#     config_path = BASE_PATH / 'lut' / fname_lut
#     repo_path = Path('../files/lut') / fname_lut

#     if overwrite or (not config_path.exists()):
#         df = pd.read_csv(Path(fname_in) / 'RUBIN_NAME.csv')
#         lut = {}
        
#         for _, row in tqdm(df[['RUBIN', 'SCIENTIFIC NAME']].iterrows(), total=df.shape[0]):
#             res = match_worms(row['SCIENTIFIC NAME'])
#             if (res == -1):
#                 print(f"No match for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")
#                 aphia_id = -1
#             else:
#                 if len(res[0]) > 1:
#                     print(
#                         f"Several matches for {row['RUBIN']} ({row['SCIENTIFIC NAME']})")
#                     print(res)
#                 aphia_id = res[0][0]['AphiaID']

#             lut[row['RUBIN']] = aphia_id
#         fc.save_pickle(config_path, lut)
#         fc.save_pickle(repo_path, lut)
#     else:
#         lut = fc.load_pickle(config_path)
        
#     return lut

In [None]:
# species_lut = get_species_lut(fname_in, overwrite=False)

In [None]:
# class LookupBiotaSpeciesCB(Callback):
#     'Match "RUBIN" species with WorMS db taxon name (AphiaID)'
#     def __init__(self, fn_lut): fc.store_attr()
#     def __call__(self, tfm):
#         lut = self.fn_lut()
#         tfm.dfs['biota']['species_id'] = tfm.dfs['biota']['RUBIN'].apply(
#             lambda x: lut[x.strip()])

In [None]:
# dfs = load_data(fname_in)
# tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
#                             RemapRdnNameCB(),
#                             ParseTimeCB(),
#                             LookupBiotaSpeciesCB(partial(get_species_lut, fname_in))])

# print(tfm()['biota'][['RUBIN', 'species_id']][:5])

### Lookup biota tissues

In [None]:
# dfs['biota']['TISSUE'].unique()

### Rename columns

In [None]:
for c in dfs['seawater'].columns:
    print(c)

sample_id
area_id
areaname
samptype_id
samptype
ref_id
displaytext
zoterourl
ref_note
datbase
lab_id
lab
latitude
longitude
begperiod
endperiod
samplingyear
totdepth
sampdepth
station
samplabcode
species_id
taxonname
taxonrank
biogroup
taxondb
taxondbid
taxondburl
taxonrepname
bodypar_id
bodypar
sliceup
slicedown
sedtype_id
sedtype
sedrepname
nuclide_id
nusymbol
volume
salinity
temperatur
filtered
filtpore
samparea
drywt
wetwt
percentwt
sampmet_id
sampmet
prepmet_id
prepmet
drymet_id
drymet
counmet_id
counmet
decayedto
detection
activity
uncertaint
unit_id
unit
vartype
freq
rangelow
rangeupp
profile
transect_id
measure_note
shapetype_id
profile_id
sampnote
ref_fulltext
ref_yearpub
ref_sampleTypes
LongLat
displaycoordinates
DisplayLong
DisplayLat
id
time


In [None]:
#| export

# Define columns of interest by sample type
# coi_grp = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
#                         'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
#            'sediment': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time',
#                         'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)',
#                         'SEDI'],
#            'biota': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time',
#                      'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm',
#                      'species_id', 'TISSUE']}

common_cols = ['area_id', 'lab_id', 'latitude', 'longitude', 'time',
               'totdepth', 'sampdepth', 'volume', 'salinity',
               'temperatur', 'nusymbol', 'activity', 'uncertaint', 'transect_id']

coi_grp = {'seawater': common_cols,
           'biota': common_cols + ['biogroup', 'taxondbid', 'bodypar_id',
                                   'species_id'],
           'sediment': common_cols + ['sedtype_id']
           }

In [None]:
#| export
# Define column names renaming rules
renaming_rules = {
    'latitude': 'lat',
    'longitude': 'lon',
    'totdepth': 'tot_depth',
    'sampdepth': 'depth',
    'temperatur': 'temperature',
    'activity': 'value',
    'uncertaint': 'unc',
    
    # 'VALUE_Bq/m³': 'value',
    # 'VALUE_Bq/kg': 'value',
    # 'ERROR%_m³': 'unc',
    # 'ERROR%_kg': 'unc',
    # 'ERROR%': 'unc',
    # 'TDEPTH': 'depth',
    # 'SDEPTH': 'depth',
    # 'LATITUDE (dddddd)': 'lat',
    # 'LATITUDE ddmmmm': 'lat',
    # 'LONGITUDE (dddddd)': 'lon',
    # 'LONGITUDE ddmmmm': 'lon',
    # # group specific
    # 'TISSUE': 'body_part',
    # 'SEDI': 'sed_type'
}


In [None]:
#| export
class RenameColumnCB(Callback):
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Select cols of interest
            tfm.dfs[k] = tfm.dfs[k].loc[:, self.coi[k]]

            # Rename cols
            tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB()])
print(tfm()['seawater'].head(5))

   area_id  lab_id        lat   lon       time  tot_depth   depth  volume  \
0     3319      44  42.833333  33.0 1975-04-01        NaN  1358.0     NaN   
1     3319      44  42.833333  33.0 1975-04-01        NaN   134.0     NaN   
2     3319      44  42.833333  33.0 1975-04-01        NaN   105.0     NaN   
3     3319      44  42.833333  33.0 1975-04-01        NaN    69.0     NaN   
4     3319      44  42.833333  33.0 1975-04-01        NaN   283.0     NaN   

   salinity  temperature nusymbol  value  unc  transect_id  
0     22.34          8.8       h3   -0.1  0.4          NaN  
1     21.09          8.5       h3   12.1  0.2          NaN  
2     20.73          8.4       h3   18.6  0.3          NaN  
3     19.65          7.9       h3   42.5  0.4          NaN  
4     21.74          8.8       h3    3.4  0.3          NaN  


### Reshape: long to wide

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB()])
df_debug = tfm()['seawater']
df_debug.dropna(axis=1, how='all', inplace=True)
df_debug.head()

Unnamed: 0,area_id,lab_id,lat,lon,time,depth,salinity,temperature,nusymbol,value,unc
0,3319,44,42.833333,33.0,1975-04-01,1358.0,22.34,8.8,h3,-0.1,0.4
1,3319,44,42.833333,33.0,1975-04-01,134.0,21.09,8.5,h3,12.1,0.2
2,3319,44,42.833333,33.0,1975-04-01,105.0,20.73,8.4,h3,18.6,0.3
3,3319,44,42.833333,33.0,1975-04-01,69.0,19.65,7.9,h3,42.5,0.4
4,3319,44,42.833333,33.0,1975-04-01,283.0,21.74,8.8,h3,3.4,0.3


In [None]:
df_debug.depth.iloc[0]

1358.0

### UUID composite key

In [None]:
import hashlib
from marisco.configs import get_cfgs
from cftime import date2num
import sys

def format_time(x): return date2num(x, units=get_cfgs('units')['time'])

ref_id = 3
samptype_id = 1
lat = 42.833333
lon = 33.0
time = format_time(df_debug.time.iloc[0]); time
depth = 1358.0
time_ds = 165842400 # time dataset/version

composite_key = '-'.join([str(i) for i in [ref_id, time_ds, samptype_id, lat, lon, time, depth]])
key_enc_md5 = hashlib.md5(composite_key.encode()).hexdigest()
key_enc_sha256 = hashlib.sha256(composite_key.encode()).hexdigest()
print(f'composite key: {composite_key} \nmd5: {key_enc_md5} \nsha256: {key_enc_sha256}')

max_n_rows = 109506
print('MD5:',   max_n_rows * sys.getsizeof(key_enc_md5) / 10**6, 'MB')
print('sha256:', max_n_rows * sys.getsizeof(key_enc_sha256) / 10**6, 'MB')

# comments:
#   - need to be documented (time format, date2num start date, ...)
#   - irreversible

composite key: 3-165842400-1-42.833333-33.0-165542400-1358.0 
md5: 62d2f8a6db90c22611885d80bae4f4b0 
sha256: d8ce70968125f9fb98b97a47002c0f9b5ecd758061938d54beec3cd8a9e95018
MD5: 8.869986 MB
sha256: 12.374178 MB


In [None]:
# cols = ['nusymbol']
# vals = ['value', 'unc']
# idx = list(set(df_debug.columns) - set(cols + vals))  # All others
# # idx = ['time', 'depth', 'lat', 'lon']

In [None]:
# df_debug.pivot_table(index=idx, columns=cols, values=vals).reset_index()


In [None]:
#| export
class ReshapeLongToWide(Callback):
    def __init__(self): fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            cols = ['nusymbol']
            vals = ['value', 'unc']
            idx = list(set(tfm.dfs[k].columns) -
                       set(cols + vals))  # All others

            tfm.dfs[k] = tfm.dfs[k].pivot_table(index=idx,
                                                columns=cols,
                                                values=vals).reset_index()

            # Flatten cols name
            tfm.dfs[k].columns = rename_cols(tfm.dfs[k].columns)

            # Set index
            tfm.dfs[k].index.name = 'sample'

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB(),
                            ParseTimeCB(),
                            RenameColumnCB(),
                            ReshapeLongToWide()])

print(tfm()['seawater'].head(5))

Empty DataFrame
Columns: [area_id, lat, transect_id, tot_depth, lab_id, time, lon, salinity, volume, depth, temp]
Index: []


In [None]:
df_debug.head()

Unnamed: 0,area_id,lab_id,lat,lon,time,depth,salinity,temp,nusymbol,value,unc
0,3319,44,42.833333,33.0,1975-04-01,1358.0,22.34,8.8,h3,-0.1,0.4
1,3319,44,42.833333,33.0,1975-04-01,134.0,21.09,8.5,h3,12.1,0.2
2,3319,44,42.833333,33.0,1975-04-01,105.0,20.73,8.4,h3,18.6,0.3
3,3319,44,42.833333,33.0,1975-04-01,69.0,19.65,7.9,h3,42.5,0.4
4,3319,44,42.833333,33.0,1975-04-01,283.0,21.74,8.8,h3,3.4,0.3


In [None]:
import pandas as pd

# Example DataFrame
# data = {
#     'lon': [1.2, 2.2],
#     'lat': [2.3, 3.3],
#     'time': [1975, 1977],
#     'depth': [2, 3],
#     'salinity': [0.1, 1.1],
#     'nusymbol': ['pu283', 'pu285'],
#     'value': [0.12, 0.14],
#     'unc': [0.01, 0.02]
# }
# df = pd.DataFrame(data)

# Pivot for both 'value' and 'unc'
pivot_combined = df_debug.pivot_table(index=['lon', 'lat', 'time', 'depth'], columns='nusymbol', values=['value', 'unc'], aggfunc='first')

# Flatten the multi-level columns
pivot_combined.columns = ['_'.join(col).strip() for col in pivot_combined.columns.values]

In [None]:
pivot_combined.columns.values

array(['unc_h3', 'value_h3'], dtype=object)

In [None]:
pivot_combined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,unc_h3,value_h3
lon,lat,time,depth,Unnamed: 4_level_1,Unnamed: 5_level_1
33.0,42.833333,1975-04-01,10.0,0.4,67.2
33.0,42.833333,1975-04-01,69.0,0.4,42.5
33.0,42.833333,1975-04-01,89.0,0.3,27.6
33.0,42.833333,1975-04-01,95.0,0.2,25.1
33.0,42.833333,1975-04-01,105.0,0.3,18.6


In [None]:
df_debug[['lon', 'lat', 'time', 'depth', 'salinity']]
# df_debug[['lon', 'lat', 'time', 'depth']]

Unnamed: 0,lon,lat,time,depth,salinity
0,33.0,42.833333,1975-04-01,1358.0,22.34
1,33.0,42.833333,1975-04-01,134.0,21.09
2,33.0,42.833333,1975-04-01,105.0,20.73
3,33.0,42.833333,1975-04-01,69.0,19.65
4,33.0,42.833333,1975-04-01,283.0,21.74
5,33.0,42.833333,1975-04-01,10.0,18.3
6,33.0,42.833333,1975-04-01,482.0,22.05
7,33.0,42.833333,1975-04-01,95.0,20.6
8,33.0,42.833333,1975-04-01,119.0,20.91
9,33.0,42.833333,1975-04-01,387.0,21.95


In [None]:

# Merge with original DataFrame to include 'salinity'
df_final = df_debug[['lon', 'lat', 'time', 'depth', 'salinity']].drop_duplicates().merge(pivot_combined, on=['lon', 'lat', 'time', 'depth'])

# Reset index to create 'sample' column
df_final.reset_index(drop=True, inplace=True)
df_final['sample'] = df_final.index

In [None]:
df_final.head()

Unnamed: 0,lon,lat,time,depth,salinity,unc_h3,value_h3,sample
0,33.0,42.833333,1975-04-01,1358.0,22.34,0.4,-0.1,0
1,33.0,42.833333,1975-04-01,134.0,21.09,0.2,12.1,1
2,33.0,42.833333,1975-04-01,105.0,20.73,0.3,18.6,2
3,33.0,42.833333,1975-04-01,69.0,19.65,0.4,42.5,3
4,33.0,42.833333,1975-04-01,283.0,21.74,0.3,3.4,4


In [None]:

# Reordering columns
columns_order = ['sample', 'lon', 'lat', 'time', 'depth', 'salinity'] + [col for col in df_final.columns if col.startswith('pu')]
df_final = df_final[columns_order]

# Display the final DataFrame
print(df_final)


In [None]:
# import pandas as pd

# # Example DataFrame
# # data = {
# #     'lon': [1.2, 2.2],
# #     'lat': [2.3, 3.3],
# #     'time': [1975, 1977],
# #     'depth': [2, 3],
# #     'salinity': [0.1, 1.1],
# #     'nusymbol': ['pu283', 'pu285'],
# #     'value': [0.12, 0.14],
# #     'unc': [0.01, 0.02]
# # }
# # df = pd.DataFrame(data)

# # Pivot for 'value'
# pivot_value = df_debug.pivot_table(index=['lon', 'lat', 'time', 'depth'], columns='nusymbol', values='value', aggfunc='first')
# pivot_value.columns = [f'{col}_value' for col in pivot_value.columns]

# # Pivot for 'unc'
# pivot_unc = df.pivot_table(index=['lon', 'lat', 'time', 'depth'], columns='nusymbol', values='unc', aggfunc='first')
# pivot_unc.columns = [f'{col}_unc' for col in pivot_unc.columns]

# # Merge the pivot tables
# pivot_merged = pivot_value.merge(pivot_unc, left_index=True, right_index=True)

# # Merge with original DataFrame to include 'salinity'
# df_final = df.drop(['nusymbol', 'value', 'unc'], axis=1).drop_duplicates().merge(pivot_merged, on=['lon', 'lat', 'time', 'depth'])

# # Reset index to create 'sample' column
# df_final.reset_index(drop=True, inplace=True)
# df_final['sample'] = df_final.index

# # Reordering columns
# columns_order = ['sample', 'lon', 'lat', 'time', 'depth', 'salinity'] + [col for col in df_final.columns if col.startswith('pu')]
# df_final = df_final[columns_order]

# # Display the final DataFrame
# print(df_final)


### Encode time (seconds since ...)

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide(),
                            EncodeTimeCB()])

print(tfm()['seawater'].head(5))


        depth      lon        time      lat  ag110m_unc  am241_unc  ba140_unc  \
sample                                                                          
0         0.0  10.3167   516153600  54.5000         NaN        NaN        NaN   
1         0.0  11.0783   515980800  54.3500         NaN        NaN        NaN   
2         0.0  11.9354  1252540800  57.2346         NaN        NaN        NaN   
3         0.0  11.9354  1307664000  57.2346         NaN        NaN        NaN   
4         0.0  11.9354  1316131200  57.2346         NaN        NaN        NaN   

        ce144_unc  cm242_unc  cm243_244_tot_unc  ...  pu240  ru103  ru106  \
sample                                           ...                        
0             NaN        NaN                NaN  ...    NaN    NaN    NaN   
1             NaN        NaN                NaN  ...    NaN    NaN    NaN   
2             NaN        NaN                NaN  ...    NaN    NaN    NaN   
3             NaN        NaN                NaN

### Sanitize coordinates

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide(),
                            EncodeTimeCB(),
                            SanitizeLonLatCB()])

print(tfm()['seawater'].head(5))


        depth      lon        time      lat  ag110m_unc  am241_unc  ba140_unc  \
sample                                                                          
0         0.0  10.3167   516153600  54.5000         NaN        NaN        NaN   
1         0.0  11.0783   515980800  54.3500         NaN        NaN        NaN   
2         0.0  11.9354  1252540800  57.2346         NaN        NaN        NaN   
3         0.0  11.9354  1307664000  57.2346         NaN        NaN        NaN   
4         0.0  11.9354  1316131200  57.2346         NaN        NaN        NaN   

        ce144_unc  cm242_unc  cm243_244_tot_unc  ...  pu240  ru103  ru106  \
sample                                           ...                        
0             NaN        NaN                NaN  ...    NaN    NaN    NaN   
1             NaN        NaN                NaN  ...    NaN    NaN    NaN   
2             NaN        NaN                NaN  ...    NaN    NaN    NaN   
3             NaN        NaN                NaN

## Encode to NetCDF

In [None]:
dfs = load_data(fname_in)
tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                            RemapRdnNameCB(),
                            ParseTimeCB(),
                            NormalizeUncUnitCB(),
                            LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                            RenameColumnCB(),
                            ReshapeLongToWide(),
                            EncodeTimeCB(),
                            SanitizeLonLatCB()])

dfs_tfm = tfm()


In [None]:
dfs_tfm

{'seawater':         depth      lon        time      lat  ag110m_unc  am241_unc  ba140_unc  \
 sample                                                                          
 0         0.0  10.3167   516153600  54.5000         NaN        NaN        NaN   
 1         0.0  11.0783   515980800  54.3500         NaN        NaN        NaN   
 2         0.0  11.9354  1252540800  57.2346         NaN        NaN        NaN   
 3         0.0  11.9354  1307664000  57.2346         NaN        NaN        NaN   
 4         0.0  11.9354  1316131200  57.2346         NaN        NaN        NaN   
 ...       ...      ...         ...      ...         ...        ...        ...   
 4581    459.0  18.2333   613440000  58.5833         NaN        NaN        NaN   
 4582    460.0  18.2333   743472000  58.5833         NaN        NaN        NaN   
 4583    460.0  18.2333   769478400  58.5800         NaN   0.000252        NaN   
 4584    460.0  18.2317   965260800  58.5833         NaN        NaN        NaN   
 458

In [None]:
tfm.logs

['Convert nuclide names to lowercase & strip any trailing space(s)',
 'Remap to MARIS radionuclide names',
 'Convert uncertainty from % to activity unit',
 'Match "RUBIN" species with WorMS db taxon name (AphiaID)',
 'Encode time as `int` representing seconds since xxx',
 'Drop row when both longitude & latitude equal 0']

### Feed global attributes

In [None]:
#| export
kw = ['oceanography', 'Earth Science > Oceans > Ocean Chemistry> Radionuclides',
      'Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure',
      'Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments',
      'Earth Science > Oceans > Ocean Chemistry, Earth Science > Oceans > Sea Ice > Isotopes',
      'Earth Science > Oceans > Water Quality > Ocean Contaminants',
      'Earth Science > Biological Classification > Animals/Vertebrates > Fish',
      'Earth Science > Biosphere > Ecosystems > Marine Ecosystems',
      'Earth Science > Biological Classification > Animals/Invertebrates > Mollusks',
      'Earth Science > Biological Classification > Animals/Invertebrates > Arthropods > Crustaceans',
      'Earth Science > Biological Classification > Plants > Macroalgae (Seaweeds)']


In [None]:
#| export
def get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw):
    return GlobAttrsFeeder(tfm.dfs, cbs=[BboxCB(),
                                    DepthRangeCB(),
                                    TimeRangeCB(),
                                    ZoteroCB(zotero_key),
                                    KeyValuePairCB('keywords', ', '.join(kw)),
                                    KeyValuePairCB('publisher_postprocess_logs', ', '.join(tfm.logs))])()

In [None]:
get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)


{'id': '',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Earth Scienc

### Encoding

In [None]:
#| export
def units_fn(grp_name):
    lut = {'seawater': 'Bq/m³',
           'sediment': 'Bq/kg',
           'biota': 'Bq/kg'}
    return lut[grp_name]


In [None]:
to_netcdf(dfs_tfm, NC_TPL_PATH, fname_out,
          get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw),
          units_fn)


% of discarded data for grp seawater: 0.0
% of discarded data for grp sediment: 0.0
% of discarded data for grp biota: 0.0


In [None]:
#| export
def encode(fname_in, fname_out, nc_tpl_path):
    dfs = load_data(fname_in)
    tfm = Transformer(dfs, cbs=[LowerStripRdnNameCB(),
                                RemapRdnNameCB(),
                                ParseTimeCB(),
                                NormalizeUncUnitCB(),
                                LookupBiotaSpeciesCB(partial(get_species_lut, fname_in)),
                                RenameColumnCB(),
                                ReshapeLongToWide(),
                                EncodeTimeCB(),
                                SanitizeLonLatCB()])
    
    dfs_tfm = tfm()
    attrs = get_attrs(tfm, zotero_key='26VMZZ2Q', kw=kw)
    to_netcdf(dfs_tfm, nc_tpl_path, fname_out, attrs, units_fn)

In [None]:
encode(fname_in, fname_out, NC_TPL_PATH)

% of discarded data for grp seawater: 0.0
% of discarded data for grp sediment: 0.0
% of discarded data for grp biota: 0.0


## Testing enum type

- how to copy enum type from src: https://chat.openai.com/share/5853317b-e102-427f-ba72-3fc6445f4368

In [None]:
from netCDF4 import Dataset
import numpy as np

fname = Path('../../_data/nc') / 'clouds.nc'
nc = Dataset(fname,'w')
# python dict with allowed values and their names.
enum_dict = {'Altocumulus': 7, 'Missing': 255, 
             'Stratus': 2, 'Clear': 0, 'Nimbostratus': 6, 'Cumulus': 4, 'Altostratus': 5,
             'Cumulonimbus': 1, 'Stratocumulus': 3}
# create the Enum type called 'cloud_t'.
cloud_type = nc.createEnumType(np.uint8,'cloud_t',enum_dict)
# print(cloud_type)
# nc.close()

time = nc.createDimension('time', None)

# create a 1d variable of type 'cloud_type'.
# The fill_value is set to the 'Missing' named value.
cloud_var = nc.createVariable('primary_cloud', cloud_type, 'time',
                              fill_value=enum_dict['Missing'])

# write some data to the variable.
cloud_var[:] = [enum_dict[k] for k in ['Clear', 'Stratus', 'Cumulus',
                                       'Missing', 'Cumulonimbus']]
nc.close()

In [None]:

# reopen the file, read the data.
nc = Dataset(fname)
cloud_var = nc.variables['primary_cloud']
print(cloud_var)
# <class 'netCDF4._netCDF4.Variable'>
# enum primary_cloud(time)
#     _FillValue: 255
# enum data type: uint8
# unlimited dimensions: time
# current shape = (5,)

print(cloud_var.datatype.enum_dict)
# {'Altocumulus': 7, 'Missing': 255, 'Stratus': 2, 'Clear': 0, 'Nimbostratus': 6, 'Cumulus': 4, 'Altostratus': 5, 'Cumulonimbus': 1, 'Stratocumulus': 3}

print(cloud_var[:])
# [0 2 4 -- 1]

nc.close()


In [None]:
time = nc.createDimension('time', None)

# create a 1d variable of type 'cloud_type'.
# The fill_value is set to the 'Missing' named value.
cloud_var = nc.createVariable('primary_cloud', cloud_type, 'time',
                              fill_value=enum_dict['Missing'])

# write some data to the variable.
cloud_var[:] = [enum_dict[k] for k in ['Clear', 'Stratus', 'Cumulus',
                                       'Missing', 'Cumulonimbus']]
nc.close()

# reopen the file, read the data.
nc = Dataset(fname)
cloud_var = nc.variables['primary_cloud']
print(cloud_var)
# <class 'netCDF4._netCDF4.Variable'>
# enum primary_cloud(time)
#     _FillValue: 255
# enum data type: uint8
# unlimited dimensions: time
# current shape = (5,)

print(cloud_var.datatype.enum_dict)
# {'Altocumulus': 7, 'Missing': 255, 'Stratus': 2, 'Clear': 0, 'Nimbostratus': 6, 'Cumulus': 4, 'Altostratus': 5, 'Cumulonimbus': 1, 'Stratocumulus': 3}

print(cloud_var[:])
# [0 2 4 -- 1]

nc.close()


RuntimeError: NetCDF: Not a valid ID