In [None]:
#| default_exp decoders

# Decoders
> Various utilities to decode MARIS dataset from `NetCDF`.

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
from pathlib import Path
from netCDF4 import Dataset
import pandas as pd
import numpy as np
from fastcore.basics import patch, store_attr
import fastcore.all as fc
from typing import Dict, Callable

from marisco.configs import (
    NC_DTYPES, 
    NC_VARS, 
    OR_VARS,
    NC_DIM,
    NC_GROUPS,
    SMP_TYPE_LUT,
    lut_path, 
    Enums,
    nc_tpl_path,
    get_time_units
)

from marisco.callbacks import (
    DecodeTimeCB
    )

## NetCDF Utilities

Convert a NetCDF file to a dictionary of DataFrames (one per group)

In [None]:
#| exports
def nc_to_dfs(
    fname: str # Path to NetCDF file
    ) -> dict: # Dictionary with group names as keys and pandas DataFrames as values
    "Load every group of a NetCDF file into a DataFrame, keyed by the upper-cased group name."
    frames = {}

    with Dataset(fname, 'r') as nc:
        for name, grp in nc.groups.items():
            # Keep data variables only: a variable named after one of the
            # group's dimensions (like 'id') is left out.
            columns = {
                key: var[:]
                for key, var in grp.variables.items()
                if key not in grp.dimensions
            }
            frame = pd.DataFrame(columns)

            # A 'time' column, when present, is read as seconds since the Unix epoch
            if 'time' in frame.columns:
                frame['time'] = pd.to_datetime(frame['time'], unit='s')

            frames[name.upper()] = frame

    return frames

Example usage:

In [None]:
#| eval: false
# fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/tepco.nc')

# One DataFrame per NetCDF group, keyed by upper-cased group name
dfs = nc_to_dfs(fname)

# Show the shape and first rows of each group
for grp, df in dfs.items():
    print('group:', grp)
    print(f'shape: {df.shape}')
    print(df.head(), '\n')

group: SEAWATER
shape: (21477, 49)
   sample         lon    lat                time  h3  h3_dl  mn54  mn54_dl  \
0       0  141.029999  37.32 2011-03-21 23:15:00 NaN    NaN   NaN      NaN   
1       1  141.029999  37.32 2011-03-22 14:28:00 NaN    NaN   NaN      NaN   
2       2  141.029999  37.32 2011-03-23 13:51:00 NaN    NaN   NaN      NaN   
3       3  141.029999  37.32 2011-03-24 09:30:00 NaN    NaN   NaN      NaN   
4       4  141.029999  37.32 2011-03-25 10:00:00 NaN    NaN   NaN      NaN   

   co58  co58_dl  ...  te132  te132_dl   i132  i132_dl  cs136  cs136_dl  \
0   5.7      7.6  ...    NaN       NaN  160.0     44.0    6.7       4.7   
1   NaN     15.0  ...    NaN       NaN    NaN     88.0    NaN       7.8   
2   NaN      NaN  ...    NaN       NaN  200.0     58.0    NaN       NaN   
3   NaN      NaN  ...    NaN       NaN  120.0     88.0   68.0      49.0   
4   NaN      NaN  ...   13.0       7.4   58.0     22.0    4.4       3.2   

   tbeta  tbeta_dl  talpha  talpha_dl  
0    

Return properties of the NetCDF file

In [None]:
#| exports
def get_netcdf_properties(file_path: str) -> dict:
    """
    Summarise the file-level properties of a NetCDF file.

    Parameters:
    file_path (str): Path to the NetCDF file.

    Returns:
    dict: Keys `file_size_bytes`, `file_format`, `groups` (list of top-level
        group names) and `global_attributes` (attribute name -> value).
        If the file does not exist, a message is printed and an empty dict
        is returned.
    """
    nc_file = Path(file_path)

    # Guard clause: report a missing file instead of raising
    if not nc_file.exists():
        print(f'File not found: {file_path}')
        return {}

    # Size on disk is read before the file is opened
    size_in_bytes = nc_file.stat().st_size

    with Dataset(file_path, 'r') as nc:
        file_format = nc.file_format
        group_names = [name for name in nc.groups.keys()]
        global_attrs = {}
        for attr in nc.ncattrs():
            global_attrs[attr] = nc.getncattr(attr)

    return {
        'file_size_bytes': size_in_bytes,
        'file_format': file_format,
        'groups': group_names,
        'global_attributes': global_attrs,
    }

Example usage:

In [None]:
#| eval: false
# fname = Path('../files/nc/encoding-test.nc')
# fname = Path('../../_data/output/dump/100-HELCOM-MORS-2018.nc')
#fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

properties = get_netcdf_properties(fname)

# Pretty-print: dict-valued entries (the global attributes) get one indented line per item
for key, val in properties.items():
    if isinstance(val, dict):
        print(f"{key}:")
        for sub_key, sub_val in val.items():
            print(f"  {sub_key}: {sub_val}")
    else:
        print(f"{key}: {val}")

file_size_bytes: 864768
file_format: NETCDF4
groups: ['biota', 'seawater', 'sediment']
global_attributes:
  id: TBD
  title: Environmental database - Helsinki Commission Monitoring of Radioactive Substances
  summary: MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.

The database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.

The database is updated and quality assured annually by HELCOM MORS EG.
  keywords: oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans

Return group properties of the NetCDF file

In [None]:
#| exports
def get_netcdf_group_properties(file_path: str) -> dict:
    """
    Describe every top-level group of a NetCDF file.

    Parameters:
    file_path (str): Path to the NetCDF file.

    Returns:
    dict: Maps each group name to a dict with keys `variables` (list of
        variable names), `dimensions` (dimension name -> size) and
        `attributes` (group attribute name -> value). If the file does not
        exist, a message is printed and an empty dict is returned.
    """
    # Guard clause: report a missing file instead of raising
    if not Path(file_path).exists():
        print(f'File not found: {file_path}')
        return {}

    with Dataset(file_path, 'r') as nc:
        return {
            name: {
                'variables': list(grp.variables.keys()),
                # len() of a dimension object gives its current size
                'dimensions': {dim: len(obj) for dim, obj in grp.dimensions.items()},
                'attributes': {attr: grp.getncattr(attr) for attr in grp.ncattrs()},
            }
            for name, grp in nc.groups.items()
        }


In [None]:
#| eval: false
# fname = Path('../files/nc/encoding-test.nc')
# fname = Path('../../_data/output/dump/100-HELCOM-MORS-2018.nc')
#fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

properties = get_netcdf_group_properties(fname)

# Pretty-print: each group is a dict, shown as one indented line per property
for key, val in properties.items():
    if isinstance(val, dict):
        print(f"{key}:")
        for sub_key, sub_val in val.items():
            print(f"  {sub_key}: {sub_val}")
    else:
        print(f"{key}: {val}")

biota:
  variables: ['lon', 'lat', 'smp_depth', 'time', 'nuclide', 'value', 'unit', 'dl', 'bio_group', 'species', 'body_part', 'drywt', 'wetwt']
  dimensions: {'id': 14873}
  attributes: {}
seawater:
  variables: ['lon', 'lat', 'smp_depth', 'tot_depth', 'time', 'nuclide', 'value', 'unit', 'dl', 'filt']
  dimensions: {'id': 20242}
  attributes: {}
sediment:
  variables: ['lon', 'lat', 'tot_depth', 'time', 'area', 'nuclide', 'value', 'unit', 'dl', 'sed_type', 'top', 'bottom']
  dimensions: {'id': 63868}
  attributes: {}


Return variable properties of the NetCDF file

In [None]:
#| exports
def get_netcdf_variable_properties(file_path: str, as_df: bool = False) -> dict | pd.DataFrame:
    """
    Retrieve properties of variables in each group of a NetCDF file.

    Parameters:
    file_path (str): Path to the NetCDF file
    as_df (bool): If True, returns a pandas DataFrame; if False, returns nested dictionary

    Returns:
    Union[dict, pd.DataFrame]: Properties of variables either as nested dictionary or DataFrame
    """
    var_properties = {}
    
    file = Path(file_path)
    if not file.exists():
        print(f'File not found: {file_path}')
        return var_properties

    with Dataset(file_path, 'r') as nc:
        for group_name, group in nc.groups.items():
            group_vars = {}
            for var_name, var in group.variables.items():
                var_info = {
                    'group': group_name,
                    'variable': var_name,
                    'data_type': var.dtype.str,
                    'dimensions_id': str(var.dimensions),
                    'dimensions_size': str(var.shape),
                }
                # Add variable attributes
                for attr in var.ncattrs():
                    var_info[f'attr_{attr}'] = str(getattr(var, attr))
                    
                group_vars[var_name] = var_info
            var_properties[group_name] = group_vars

    if not as_df:
        return var_properties
    
    # Convert to DataFrame
    rows = []
    for group_name, group_vars in var_properties.items():
        for var_name, var_info in group_vars.items():
            rows.append(var_info)
    
    df = pd.DataFrame(rows)
    
    # Reorder columns to put key information first
    first_cols = ['group', 'variable', 'dimensions_id', 'dimensions_size']
    other_cols = [col for col in df.columns if col not in first_cols]
    df = df[first_cols + other_cols]
    
    return df

In [None]:
#| eval: false
# fname = Path('../files/nc/encoding-test.nc')
# fname = Path('../../_data/output/dump/100-HELCOM-MORS-2018.nc')
#fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

# Transposed so that each variable is a column and each property a row
get_netcdf_variable_properties(fname, as_df=True).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
group,biota,biota,biota,biota,biota,biota,biota,biota,biota,biota,...,sediment,sediment,sediment,sediment,sediment,sediment,sediment,sediment,sediment,sediment
variable,lon,lat,smp_depth,time,nuclide,value,unit,dl,bio_group,species,...,tot_depth,time,area,nuclide,value,unit,dl,sed_type,top,bottom
dimensions_id,"('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)",...,"('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)"
dimensions_size,"(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)",...,"(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)"
data_type,<f4,<f4,<f4,<u8,<i8,<f4,<i8,<i8,<i8,<i8,...,<f4,<u8,<i8,<i8,<f4,<i8,<i8,<i8,<f4,<f4
attr_long_name,Measurement longitude,Measurement latitude,Sample depth below seal level,Time of measurement,Nuclide,Activity,Unit,Detection limit,Biota group,Species,...,Total depth below seal level,Time of measurement,Marine area/region id,Nuclide,Activity,Unit,Detection limit,Sediment type,Top depth of sediment layer,Bottom depth of sediment layer
attr_standard_name,longitude,latitude,sample_depth_below_sea_floor,time,nuclide,activity,unit,detection_limit,biota_group_tbd,species,...,total_depth_below_sea_floor,time,area_id,nuclide,activity,unit,detection_limit,sediment_type_tbd,top_depth_of_sediment_layer_tbd,bottom_depth_of_sediment_layer_tbd
attr_units,degrees_east,degrees_north,m,seconds since 1970-01-01 00:00:00.0,,,,,,,...,m,seconds since 1970-01-01 00:00:00.0,,,,,,,,
attr_axis,,,Z,T,,,,,,,...,Z,T,,,,,,,,
attr_time_origin,,,,1970-01-01 00:00:00,,,,,,,...,,1970-01-01 00:00:00,,,,,,,,


Return the enum dictionary for a variable in a NetCDF file.

In [None]:
#| exports
def get_enum_dict(file_path: str, var_name: str) -> dict:
    """
    Collect the enum definition attached to a variable, for every group that has it.

    Parameters:
    file_path (str): Path to the NetCDF file
    var_name (str): Name of the variable to get enum for

    Returns:
    dict: Maps each group name to `{'variable': var_name, 'enum_dict': {...}}`,
        where `enum_dict` is taken from the variable's datatype. Groups that
        lack the variable, or whose variable datatype has no `enum_dict`, are
        omitted; the result is an empty dict if none qualifies.
    """
    found = {}
    with Dataset(file_path, 'r') as nc:
        for grp_name, grp in nc.groups.items():
            # Skip groups that do not hold the variable at all
            if var_name not in grp.variables:
                continue
            datatype = grp.variables[var_name].datatype
            # Skip variables whose datatype carries no enum definition
            if not hasattr(datatype, 'enum_dict'):
                continue
            found[grp_name] = {
                'variable': var_name,
                'enum_dict': datatype.enum_dict,
            }
    return found

In [None]:
#| eval: false
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')
# Enum definition of the 'nuclide' variable, per group that has one
nuclide_mapping = get_enum_dict(fname, 'nuclide')
nuclide_mapping

{'biota': {'variable': 'nuclide',
  'enum_dict': {'NOT APPLICABLE': -1,
   'NOT AVAILABLE': 0,
   'h3': 1,
   'be7': 2,
   'c14': 3,
   'k40': 4,
   'cr51': 5,
   'mn54': 6,
   'co57': 7,
   'co58': 8,
   'co60': 9,
   'zn65': 10,
   'sr89': 11,
   'sr90': 12,
   'zr95': 13,
   'nb95': 14,
   'tc99': 15,
   'ru103': 16,
   'ru106': 17,
   'rh106': 18,
   'ag106m': 19,
   'ag108': 20,
   'ag108m': 21,
   'ag110m': 22,
   'sb124': 23,
   'sb125': 24,
   'te129m': 25,
   'i129': 28,
   'i131': 29,
   'cs127': 30,
   'cs134': 31,
   'cs137': 33,
   'ba140': 34,
   'la140': 35,
   'ce141': 36,
   'ce144': 37,
   'pm147': 38,
   'eu154': 39,
   'eu155': 40,
   'pb210': 41,
   'pb212': 42,
   'pb214': 43,
   'bi207': 44,
   'bi211': 45,
   'bi214': 46,
   'po210': 47,
   'rn220': 48,
   'rn222': 49,
   'ra223': 50,
   'ra224': 51,
   'ra225': 52,
   'ra226': 53,
   'ra228': 54,
   'ac228': 55,
   'th227': 56,
   'th228': 57,
   'th232': 59,
   'th234': 60,
   'pa234': 61,
   'u234': 62,
   'u

## Convert NetCDF to OpenRefine CSV


MARIS NetCDF files can be converted to OpenRefine CSV files. These CSV files are compatible with the [OpenRefine](https://openrefine.org/) data cleaning tool, which is used during the MARIS data cleaning process before loading into the MARIS database.


### WIP - TODO: NetCDFOpenRefineDecoder

1. Include ENUM with NetCDF file. 
2. Can enum (alone) be used to update the netcdf data to openrefine data?


Points to consider:
1. validate_enum_mappings
2. Generate dataframe of the netcdf file
3. Update columns to 'MARISCO' naming standard, i.e. capitals. - this generalisation step will assist if other decoders are added.
4. Convert column names from 'MARISCO' standard to 'CSV_VARS' standard. 
5. Convert values using CSV_DTYPES - the data standardised from the netcdf file will be considered the general standard (i.e. using enum values). - This too will assist if other decoders are added.



In [None]:
#| exports
class NetCDFDecoder:
    """Decode MARIS NetCDF files to human readable formats."""
    def __init__(self, 
                 dfs: Dict[str, pd.DataFrame], # DataFrames keyed by group name
                 fname_in: str,  # Path to NetCDF file
                 dest_out: str, # Destination path for the saved files; `save_dataframes` rejects `None`
                 output_format:str, # Format name checked by `save_dataframes`, e.g. 'csv', 'excel', 'hdf'
                 remap_vars: Dict[str, str], # Column-name mapping handed to `process_group` by `process_groups`
                 verbose: bool=False # If True, `save_dataframes` prints a message per saved group
                ):
        # Store every constructor argument as an attribute of the same name
        fc.store_attr()        

In [None]:
def process_groups(self):
    """Run `process_group` on every (group name, DataFrame) pair held in `self.dfs`."""
    for name in self.dfs:
        self.process_group(name, self.dfs[name], self.remap_vars)

In [None]:
def process_group(self, group_name: str, df: pd.DataFrame, remap_vars: Dict[str, str]):
    """Rename the columns of one group's DataFrame in place via `remap_vars`, then save it."""
    # Columns without an entry in remap_vars keep their current name
    renamed = list(map(lambda col: remap_vars.get(col, col), df.columns))
    df.columns = renamed
    # NOTE(review): `save_dataframe` is not defined in the visible source
    # (only `save_dataframes` is) - confirm which method is intended.
    self.save_dataframe(group_name, df)

In [None]:
#| exports
@patch
def decode(self: NetCDFDecoder):
    "Decode NetCDF to human readable files: save `self.dfs` to disk and return them."
    # TODO: rename the columns before saving (not implemented yet).
    # Must be called on the instance: a bare `save_dataframes()` raises NameError.
    self.save_dataframes()

    return self.dfs
    

In [None]:
#| exports
@patch
def save_dataframes(self: NetCDFDecoder,
                    dest_path: str | None = None,
                    output_format: str | None = None):
    """
    Save DataFrames to files in the specified format.

    Parameters:
        dest_path (str, optional): Base path for output files; any extension
            on the last path component is stripped. If None, uses `self.dest_out`.
        output_format (str, optional): Format to save files in. If None, uses
            `self.output_format`. Options:
            - 'csv': Comma-separated values
            - 'excel': Excel spreadsheet (one sheet per group)
            - 'json': JSON format
            - 'parquet': Apache Parquet format
            - 'hdf': HDF5 format (one key per group)
            - 'pickle': Python pickle format
            - 'feather': Feather format
            - 'stata': Stata format

    'excel' and 'hdf' write all groups to a single file (`<base>.xlsx` /
    `<base>.h5`); every other format writes one `<base>_<group><ext>` file
    per group.

    Raises:
        ValueError: If no destination path is available, or the format is
            not supported.
    """
    # Explicit arguments win; otherwise fall back to the values stored on the instance
    if dest_path is None:
        dest_path = self.dest_out
    if output_format is None:
        output_format = self.output_format

    if dest_path is None:
        raise ValueError("No destination path provided")
    # Get base path without extension
    base_path = str(Path(dest_path).with_suffix(''))

    # Formats that create separate files for each group: extension + writer
    per_group_writers = {
        'csv': ('.csv', lambda df, path: df.to_csv(path, index=False)),
        'json': ('.json', lambda df, path: df.to_json(path)),
        'parquet': ('.parquet', lambda df, path: df.to_parquet(path)),
        'pickle': ('.pkl', lambda df, path: df.to_pickle(path)),
        'feather': ('.feather', lambda df, path: df.to_feather(path)),
        'stata': ('.dta', lambda df, path: df.to_stata(path)),
    }
    supported = ['excel', 'hdf', *per_group_writers]

    # Validate before touching the filesystem
    if output_format not in supported:
        raise ValueError(
            f"Unsupported output format: {output_format}. Supported formats: {supported}"
        )

    # Handle formats that combine all groups in one file
    if output_format == 'excel':
        output_path = f"{base_path}.xlsx"
        with pd.ExcelWriter(output_path) as writer:
            for group_name, df in self.dfs.items():
                df.to_excel(writer, sheet_name=group_name, index=False)
                if self.verbose:
                    print(f"Saved {group_name} to sheet in {output_path}")

    elif output_format == 'hdf':
        output_path = f"{base_path}.h5"
        with pd.HDFStore(output_path) as store:
            for group_name, df in self.dfs.items():
                store[group_name] = df
                if self.verbose:
                    print(f"Saved {group_name} to group in {output_path}")

    # Handle formats that create separate files for each group
    else:
        extension, save = per_group_writers[output_format]
        for group_name, df in self.dfs.items():
            output_path = f"{base_path}_{group_name}{extension}"
            save(df, output_path)

            if self.verbose:
                print(f"Saved {group_name} to {output_path}")

In [None]:
#|eval: false
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

# NetCDFDecoder works on already-loaded DataFrames, so read the groups first
# and pass every constructor argument explicitly.
decoder = NetCDFDecoder(
                    dfs=nc_to_dfs(fname),                # One DataFrame per NetCDF group
                    fname_in=fname,                      # Path to source NetCDF file
                    dest_out=fname.with_suffix('.csv'),  # Base name for output CSV files
                    output_format='csv',                 # Saves separate CSV files
                    remap_vars={},                       # Empty mapping: no column renaming
                    verbose=True
                 )
# `decode` is meant to save the DataFrames itself, so no separate save call is made here
decoder.decode()

nc_enum_dict [nuclide]: {'NOT APPLICABLE': -1, 'NOT AVAILABLE': 0, 'h3': 1, 'be7': 2, 'c14': 3, 'k40': 4, 'cr51': 5, 'mn54': 6, 'co57': 7, 'co58': 8, 'co60': 9, 'zn65': 10, 'sr89': 11, 'sr90': 12, 'zr95': 13, 'nb95': 14, 'tc99': 15, 'ru103': 16, 'ru106': 17, 'rh106': 18, 'ag106m': 19, 'ag108': 20, 'ag108m': 21, 'ag110m': 22, 'sb124': 23, 'sb125': 24, 'te129m': 25, 'i129': 28, 'i131': 29, 'cs127': 30, 'cs134': 31, 'cs137': 33, 'ba140': 34, 'la140': 35, 'ce141': 36, 'ce144': 37, 'pm147': 38, 'eu154': 39, 'eu155': 40, 'pb210': 41, 'pb212': 42, 'pb214': 43, 'bi207': 44, 'bi211': 45, 'bi214': 46, 'po210': 47, 'rn220': 48, 'rn222': 49, 'ra223': 50, 'ra224': 51, 'ra225': 52, 'ra226': 53, 'ra228': 54, 'ac228': 55, 'th227': 56, 'th228': 57, 'th232': 59, 'th234': 60, 'pa234': 61, 'u234': 62, 'u235': 63, 'u238': 64, 'np237': 65, 'np239': 66, 'pu238': 67, 'pu239': 68, 'pu240': 69, 'pu241': 70, 'am240': 71, 'am241': 72, 'cm242': 73, 'cm243': 74, 'cm244': 75, 'cs134_137_tot': 76, 'pu239_240_tot': 77

In [None]:
#| eval: false
# Column names of the BIOTA group held by the decoder
decoder.dfs['BIOTA'].columns

Index(['LON', 'LAT', 'SMP_DEPTH', 'TIME', 'NUCLIDE', 'VALUE', 'UNIT', 'DL',
       'BIO_GROUP', 'SPECIES', 'BODY_PART', 'DRYWT', 'WETWT'],
      dtype='object')