In [None]:
#| default_exp utils

# Utilities
> Various utilities

In [None]:
#| export
from pathlib import Path
from math import modf
from netCDF4 import Dataset
from fastcore.test import test_eq
import fastcore.all as fc
import pandas as pd
import numpy as np
from tqdm import tqdm 
import requests
from shapely import MultiPoint
import jellyfish as jf
from dataclasses import dataclass

from typing import List, Dict, Callable, Tuple, Optional, Union

from marisco.configs import cache_path


# from collections.abc import Callable

Below, we define constants that are used throughout the package.

In [None]:
#| exports
# TBD: move to configs
NA = 'Not available'

## Core

Abstracting some common operations.

In [None]:
#| exports
def get_unique_across_dfs(dfs: Dict[str, pd.DataFrame],  # Dictionary of dataframes
                          col_name: str='NUCLIDE', # Column name to extract unique values from
                          as_df: bool=False, # Return a DataFrame of unique values
                          include_nchars: bool=False # Add a column with the number of characters in the value
                          ) -> Union[List[str], pd.DataFrame]: # Sorted list of unique values, or a DataFrame when `as_df=True`
    "Get the unique values of column `col_name` across all dataframes in `dfs`."
    # Dataframes that do not have `col_name` are skipped rather than raising.
    uniques = set().union(*(df[col_name].unique() for df in dfs.values() if col_name in df.columns))
    # A set has no stable iteration order (string hashing is randomized per process),
    # so sort to make the returned list and the DataFrame `index` column reproducible.
    try:
        unique_values = sorted(uniques)
    except TypeError:
        # Values of mixed, non-comparable types (e.g. str and float): order by their string form.
        unique_values = sorted(uniques, key=str)
    if not as_df:
        return unique_values
    # `reset_index` exposes the position of each value as an explicit `index` column.
    df_uniques = pd.DataFrame(unique_values, columns=['value']).reset_index()
    if include_nchars: df_uniques['n_chars'] = df_uniques['value'].str.len()
    return df_uniques

Example of use:

In [None]:
dfs_test = {'SEAWATER': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
            'BIOTA': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
            'SEDIMENT': pd.DataFrame({'NUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}

fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')), 
           set(['cs134', 'cs137', 'cs134_137_tot']))

What if the column name is missing from one of the dataframes?

In [None]:
dfs_test = {'SEAWATER': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134_137_tot', 'cs134_137_tot']}),
            'BIOTA': pd.DataFrame({'NUCLIDE': ['cs137', 'cs134', 'cs134_137_tot']}),
            'SEDIMENT': pd.DataFrame({'NONUCLIDE': ['cs134_137_tot', 'cs134_137_tot', 'cs134_137_tot']})}

fc.test_eq(set(get_unique_across_dfs(dfs_test, col_name='NUCLIDE')), 
           set(['cs134', 'cs137', 'cs134_137_tot']))

In [None]:
get_unique_across_dfs(dfs_test, col_name='NUCLIDE', as_df=True, include_nchars=True)

Unnamed: 0,index,value,n_chars
0,0,cs134,5
1,1,cs134_137_tot,13
2,2,cs137,5


In [None]:
#| exports
class Remapper():
    "Remap a data provider lookup table to a MARIS lookup table using fuzzy matching."
    def __init__(self,
                 provider_lut_df: pd.DataFrame, # Data provider lookup table to be remapped
                 maris_lut_fn: Union[Callable, pd.DataFrame], # MARIS lookup table or function returning the path
                 maris_col_id: str, # MARIS lookup table column name for the id
                 maris_col_name: str, # MARIS lookup table column name for the name
                 provider_col_to_match: str, # Data provider lookup table column name for the name to match
                 provider_col_key: str, # Data provider lookup table column name for the key
                 fname_cache: str # Cache file name
                 ):
        fc.store_attr()
        self.cache_file = cache_path() / fname_cache
        # `maris_lut_fn` is either a callable producing the lookup table or the table itself
        self.maris_lut = maris_lut_fn() if callable(maris_lut_fn) else maris_lut_fn
        self.lut = {}
        # Defaults so that `select_match`/`_format_output` also work before
        # `generate_lookup_table` has been called.
        self.fixes = {}
        self.as_df = True

    def generate_lookup_table(self, 
                              fixes: Optional[Dict[str, str]] = None, # Lookup table fixes (provider name -> name to match instead)
                              as_df: bool = True, # Whether to return a DataFrame
                              overwrite: bool = True # Recompute the table even if a cache file exists
                              ):
        """Generate a lookup table from a data provider lookup table to a MARIS lookup table using fuzzy matching.

        When `overwrite` is False and the cache file exists, the table is loaded from
        the cache and `fixes` are not re-applied.
        """
        # `None` instead of a mutable `{}` default: a default dict would be shared between calls.
        self.fixes = {} if fixes is None else fixes
        self.as_df = as_df
        if overwrite or not self.cache_file.exists():
            self._create_lookup_table()
            fc.save_pickle(self.cache_file, self.lut)
        else:
            # NOTE(review): the cache is unpickled; only load cache files you trust.
            self.lut = fc.load_pickle(self.cache_file)

        return self._format_output()

    def _create_lookup_table(self):
        "Fill `self.lut` with one `Match` per row of the provider lookup table."
        df = self.provider_lut_df
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing"): 
            self._process_row(row)

    def _process_row(self, row):
        "Match a single provider row against the MARIS lookup table and store the best candidate."
        value_to_match = row[self.provider_col_to_match]
        if isinstance(value_to_match, str):  # Only process if value is a string
            # If value is in fixes, use the fixed value
            name_to_match = self.fixes.get(value_to_match, value_to_match)
            # `match_maris_lut` returns candidates sorted by ascending score: keep the first one
            result = match_maris_lut(self.maris_lut, name_to_match, self.maris_col_id, self.maris_col_name).iloc[0]
            match = Match(result[self.maris_col_id], result[self.maris_col_name], 
                          value_to_match, result['score'])
            self.lut[row[self.provider_col_key]] = match
        else:
            # Handle non-string values (e.g., NaN)
            self.lut[row[self.provider_col_key]] = Match(-1, "Unknown", value_to_match, 0)
            
    def select_match(self, 
                     match_score_threshold: int = 1, # Entries with a score below this value are dropped
                     verbose: bool = False # Print how many entries fall on each side of the threshold
                     ):
        """Keep only the entries whose match score is `match_score_threshold` or higher.

        Note that `self.lut` itself is replaced by the filtered table.
        """
        if verbose:
            matched_len= len([v for v in self.lut.values() if v.match_score < match_score_threshold])
            print(f"{matched_len} entries matched the criteria, while {len(self.lut) - matched_len} entries had a match score of {match_score_threshold} or higher.")
        
        self.lut = {k: v for k, v in self.lut.items() if v.match_score >= match_score_threshold}
        return self._format_output()

    def _format_output(self):
        "Return `self.lut` as is, or as a DataFrame sorted by descending match score when `self.as_df` is set."
        if not self.as_df: return self.lut
        # Only the three fields listed below are requested as columns
        df_lut = pd.DataFrame.from_dict(self.lut, orient='index', 
                                        columns=['matched_maris_name', 'source_name', 'match_score'])
        df_lut.index.name = 'source_key'
        return df_lut.sort_values(by='match_score', ascending=False)

In [None]:
#| hide 
# TBD: Setting unique universal id
# import hashlib
# combined_str = "32.123_-180.435_2022-01-01T00:00:00.000"
# hash_object = hashlib.sha256(combined_str.encode())
# unique_id = hash_object.hexdigest(); unique_id

## Validation

In [None]:
#| exports
# TBD: Assess if still needed
def has_valid_varname(
    var_names: List[str], # variable names
    cdl_path: str, # Path to MARIS CDL file (point of truth)
    group: Optional[str] = None, # Check if the variable names is contained in the group
) -> bool: # `True` if every name is allowed, `False` otherwise
    "Check that proposed variable names are in MARIS CDL"
    with Dataset(cdl_path) as nc:
        # Variable names declared in each group of the CDL file
        cdl_vars = {grp.name: list(grp.variables.keys()) for grp in nc.groups.values()}

    if group is not None:
        # Raises `KeyError` if `group` is not a group of the file
        allowed_vars = set(cdl_vars[group])
    else:
        # No group given: a name is valid if it appears in any group
        allowed_vars = {name for names in cdl_vars.values() for name in names}

    has_valid = True
    # Every invalid name is reported, not just the first one
    for name in var_names:
        if name not in allowed_vars:
            has_valid = False
            if group is not None:
                print(f'"{name}" variable name not found in group "{group}" of MARIS CDL')
            else:
                print(f'"{name}" variable name not found in MARIS CDL')
    return has_valid

In [None]:
# VARNAMES = ['lat', 'lon']
# test_eq(has_valid_varname(VARNAMES, './files/nc/maris-cdl.nc'), True)

In [None]:
# VARNAMES = ['ba140_invalid', 'ba140_dl']
# test_eq(has_valid_varname(VARNAMES, './files/nc/maris-cdl.nc'), False)

## Geoprocessing

In [None]:
#| exports
def get_bbox(df: pd.DataFrame, # DataFrame holding the coordinate columns
             coord_cols: Tuple[str, str] = ('LON', 'LAT') # Names of the x and y coordinate columns, in that order
            ):
    "Get the bounding box of a DataFrame."
    x, y = coord_cols
    # Pair the two columns directly: `iterrows` would build a full Series for every row
    # only to read two values from it.
    coords = list(zip(df[x], df[y]))
    # The envelope of the point collection is returned as a shapely geometry
    return MultiPoint(coords).envelope

In [None]:
df = pd.DataFrame({'LON': np.linspace(-10, 5, 20), 'LAT':  np.linspace(40, 50, 20)})
bbox = get_bbox(df);

In [None]:
# To get `lon_min`, `lon_max`, `lat_min`, `lat_max`
bbox.bounds

(-10.0, 40.0, 5.0, 50.0)

In [None]:
# And its Well-Known Text (WKT) representation
bbox.wkt

'POLYGON ((-10 40, 5 40, 5 50, -10 50, -10 40))'

In [None]:
# If unique (lon, lat)
df = pd.DataFrame({'LON': [0, 0], 'LAT':  [1, 1]})
bbox = get_bbox(df);

In [None]:
bbox.bounds

(0.0, 1.0, 0.0, 1.0)

In [None]:
#| exports
def ddmm_to_dd(
    ddmmmm: float # Coordinates in degrees/minutes decimal format
    ) -> float: # Coordinates in degrees decimal format
    "Convert degrees/minutes decimal to degrees decimal."
    # `modf` splits the value into its fractional and whole parts (both carry the sign)
    fraction, whole = modf(ddmmmm)
    # The fractional part holds the minutes as `.mm`: scale it back to minutes
    minutes = fraction * 100
    decimal_degrees = int(whole) + minutes / 60
    return round(decimal_degrees, 6)

In [None]:
fc.test_close(ddmm_to_dd(45.34), 45.566667)

## Downloaders

In [None]:
#| exports
def download_files_in_folder(
    owner: str, # GitHub owner
    repo: str, # GitHub repository
    src_dir: str, # Source directory
    dest_dir: str, # Destination directory
    branch: Optional[str] = None, # Branch, tag or commit; if omitted, the listing uses the default branch and files come from `master`
    timeout: float = 30 # Seconds to wait for each HTTP response
    ):
    "Download every file located directly in `src_dir` of a GitHub repository into `dest_dir`."
    # The GitHub contents API lists the entries of the folder
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{src_dir}"
    # Only pin the listing to a ref when one is requested explicitly
    params = {'ref': branch} if branch else None
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return

    # Sub-directories are skipped: only entries of type "file" are downloaded
    for item in response.json():
        if item["type"] == "file":
            download_file(owner, repo, src_dir, dest_dir, item["name"],
                          branch=branch or 'master', timeout=timeout)

def download_file(
    owner: str, # GitHub owner
    repo: str, # GitHub repository
    src_dir: str, # Source directory
    dest_dir: str, # Destination directory (created if missing)
    fname: str, # Name of the file to download
    branch: str = 'master', # Branch, tag or commit to download from
    timeout: float = 30 # Seconds to wait for the HTTP response
    ):
    "Download a single raw file from a GitHub repository and save it in `dest_dir`."
    url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{src_dir}/{fname}"
    response = requests.get(url, timeout=timeout)

    if response.status_code == 200:
        dest = Path(dest_dir)
        # Make sure the destination folder exists before writing into it
        dest.mkdir(parents=True, exist_ok=True)
        with open(dest / fname, "wb") as file:
            file.write(response.content)
        print(f"{fname} downloaded successfully.")
    else:
        print(f"Error: {response.status_code}")

## WoRMS
The [World Register of Marine Species (WoRMS)](https://www.marinespecies.org) is an authoritative classification and catalogue of marine names. It provides a REST API (among others) that allows you to "fuzzy" match any species name encountered in marine data sources against its own database. There are several types of matches, as described [here](https://www.marinespecies.org/tutorial_taxonmatch.php).

In [None]:
#| exports
def match_worms(
    name: str, # Name of species to look up in WoRMS
    timeout: float = 30 # Seconds to wait for the WoRMS server before giving up
    ):
    "Lookup `name` in WoRMS (fuzzy match). Returns the decoded JSON response, or -1 if the status code is not 200."
    url = 'https://www.marinespecies.org/rest/AphiaRecordsByMatchNames'
    params = {
        'scientificnames[]': [name],
        'marine_only': 'true'
    }
    headers = {
        'accept': 'application/json'
    }

    # Without a timeout, `requests` waits indefinitely on an unresponsive server
    response = requests.get(url, params=params, headers=headers, timeout=timeout)

    # Any status other than 200 is reported with the -1 sentinel
    return response.json() if response.status_code == 200 else -1

For instance:

In [None]:
#|eval: false
match_worms('Aristeus antennatus')

[[{'AphiaID': 107083,
   'url': 'https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083',
   'scientificname': 'Aristeus antennatus',
   'authority': '(Risso, 1816)',
   'status': 'accepted',
   'unacceptreason': None,
   'taxonRankID': 220,
   'rank': 'Species',
   'valid_AphiaID': 107083,
   'valid_name': 'Aristeus antennatus',
   'valid_authority': '(Risso, 1816)',
   'parentNameUsageID': 106807,
   'kingdom': 'Animalia',
   'phylum': 'Arthropoda',
   'class': 'Malacostraca',
   'order': 'Decapoda',
   'family': 'Aristeidae',
   'genus': 'Aristeus',
   'citation': 'DecaNet eds. (2024). DecaNet. Aristeus antennatus (Risso, 1816). Accessed through: World Register of Marine Species at: https://www.marinespecies.org/aphia.php?p=taxdetails&id=107083 on 2024-12-17',
   'lsid': 'urn:lsid:marinespecies.org:taxname:107083',
   'isMarine': 1,
   'isBrackish': 0,
   'isFreshwater': 0,
   'isTerrestrial': 0,
   'isExtinct': 0,
   'match_type': 'exact',
   'modified': '2022-08-24T09:48:1

In [None]:
#| hide 
# open dbo_species
#from tqdm import tqdm
#results = []
#species = pd.read_excel(species_lut_path()).species
#for i, name in tqdm(enumerate(species), total=len(species)):
#    if i > 1:
#        worms_match = match_worms(name)
#        if worms_match != -1:
#            results.append(worms_match[0][0])
# np.unique(np.array([result['phylum'] for result in results]))
#len(maris_worms_matches)
#maris_worms_matches = fc.load_pickle('./files/pkl/maris-worms-matches.pkl')
#np.unique(np.array([result['phylum'] for result in maris_worms_matches]))
#len([result for result in maris_worms_matches if result['status'] == 'accepted'])

## Fuzzy matching for MARIS Lookup Tables
Using https://jamesturk.github.io/jellyfish fuzzy matching distance metrics.


In [None]:
#| exports
@dataclass
class Match:
    "Match between a data provider name and a MARIS lookup table."
    matched_id: int  # Id of the matched MARIS lookup table entry (`Remapper` stores -1 for non-string source values)
    matched_maris_name: str  # Name of the matched MARIS lookup table entry (`Remapper` stores "Unknown" for non-string source values)
    source_name: str  # Data provider value that was looked up
    match_score: int  # Score of the match; `Remapper` fills it with the `score` column returned by `match_maris_lut`

In [None]:
#| exports
def match_maris_lut(
    lut: Union[str, pd.DataFrame, Path], # Either str, Path or DataFrame
    data_provider_name: str, # Name of data provider nomenclature item to look up 
    maris_id: str, # Id of MARIS lookup table nomenclature item to match
    maris_name: str, # Name of MARIS lookup table nomenclature item to match
    dist_fn: Callable = jf.levenshtein_distance, # Distance function
    nresults: int = 10 # Maximum number of results to return
) -> pd.DataFrame: # Columns `maris_id`, `maris_name` and `score`, sorted by ascending score
    "Fuzzy matching data provider and MARIS lookup tables (e.g. biota species, sediments, ...)."
    if isinstance(lut, (str, Path)):
        df = pd.read_excel(lut)  # Load the LUT if a path is provided
    elif isinstance(lut, pd.DataFrame):
        df = lut  # Use the DataFrame directly if provided
    else:
        raise ValueError("lut must be either a file path or a DataFrame")

    # Rows without a name cannot be scored; `astype` returns a new frame, so the
    # `score` column added below never touches the caller's DataFrame.
    df = df.dropna(subset=[maris_name]).astype({maris_id: 'int'})
    # Comparison is case-insensitive; lower-case the searched name once, not once per row
    name_lower = data_provider_name.lower()
    df['score'] = df[maris_name].str.lower().apply(lambda x: dist_fn(name_lower, x))
    df = df.sort_values(by='score', ascending=True)[:nresults]
    return df[[maris_id, maris_name, 'score']]

Below an example trying to match the name "PLANKTON" with `dbo_species_cleaned.xlsx` MARIS biota species lookup table:

In [None]:
lut_fname = '../files/lut/dbo_species_cleaned.xlsx'
match_maris_lut(lut_fname, data_provider_name='PLANKTON', 
                maris_id='species_id', maris_name='species')

Unnamed: 0,species_id,species,score
281,280,Plankton,0
696,695,Zooplankton,3
633,632,Palaemon,4
697,696,Phytoplankton,5
812,811,Chanos,5
160,159,Neuston,5
234,233,Penaeus,6
1458,1457,Lamnidae,6
1438,1437,Labrus,6
1527,1526,Favites,6


Below, we demonstrate matching the laboratory name "Central Mining Institute, Poland" with the MARIS lab lookup table from `dbo_lab.xlsx`. This example utilizes the `lab` and `country` columns. Note that in this instance, `df_lut` is passed directly as the `lut` argument.

In [None]:
lut_fname = '../files/lut/dbo_lab.xlsx'
df_lut=pd.read_excel(lut_fname)
df_lut['lab_country'] = df_lut['lab'] + '_' + df_lut['country']

match_maris_lut(lut=df_lut, data_provider_name='Central Mining Institute, Poland', 
                maris_id='lab_id', maris_name='lab_country')

Unnamed: 0,lab_id,lab_country,score
6,5,Central Mining Institute_Poland,2
203,202,Polytechnic Institute_Romania,18
282,281,Norwegian Polar Institute_Norway,21
113,112,Nuclear Research Institute_Vietnam,22
246,245,Paul Scherrer Institute_Switzerland,22
136,135,Nuclear Energy Board_Ireland,23
471,474,Kobe University_Japan,23
429,432,Qatar University_Qatar,23
174,173,Interfaculty Reactor Institute_Netherlands,23
177,176,RIKILT_Netherlands,23


Below an example trying to match the name "GLACIAL" with dbo_sedtype.xlsx MARIS sediment lookup table:

In [None]:
lut_fname = '../files/lut/dbo_sedtype.xlsx'
match_maris_lut(lut_fname, data_provider_name='GLACIAL', 
                maris_id='sedtype_id', maris_name='sedtype')

Unnamed: 0,sedtype_id,sedtype,score
26,25,Glacial,0
3,2,Gravel,4
2,1,Clay,5
51,50,Glacial clay,5
4,3,Marsh,6
7,6,Sand,6
13,12,Silt,6
15,14,Sludge,6
27,26,Soft,7
52,51,Soft clay,7


In [None]:
lut_fname = '../files/lut/dbo_nuclide.xlsx'
match_maris_lut(lut_fname, data_provider_name='CS-137', 
                maris_id='nuclide_id', maris_name='nc_name')

Unnamed: 0,nuclide_id,nc_name,score
31,33,cs137,1
30,31,cs134,2
29,30,cs127,2
99,102,cs136,2
109,112,sb127,3
111,114,ce139,3
25,24,sb125,4
36,38,pm147,4
28,29,i131,4
110,113,ba133,4


## Downloaders

In [None]:
#| exports
# NOTE(review): `download_files_in_folder` and `download_file` are also defined in an
# earlier "Downloaders" cell of this notebook; this later definition is the one in
# effect at runtime. Keep a single copy.
def download_files_in_folder(
    owner: str, # GitHub owner
    repo: str, # GitHub repository
    src_dir: str, # Source directory
    dest_dir: str, # Destination directory
    branch: Optional[str] = None, # Branch, tag or commit; if omitted, the listing uses the default branch and files come from `master`
    timeout: float = 30 # Seconds to wait for each HTTP response
    ):
    "Download every file located directly in `src_dir` of a GitHub repository into `dest_dir`."
    # The GitHub contents API lists the entries of the folder
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{src_dir}"
    # Only pin the listing to a ref when one is requested explicitly
    params = {'ref': branch} if branch else None
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return

    # Sub-directories are skipped: only entries of type "file" are downloaded
    for item in response.json():
        if item["type"] == "file":
            download_file(owner, repo, src_dir, dest_dir, item["name"],
                          branch=branch or 'master', timeout=timeout)

def download_file(
    owner: str, # GitHub owner
    repo: str, # GitHub repository
    src_dir: str, # Source directory
    dest_dir: str, # Destination directory (created if missing)
    fname: str, # Name of the file to download
    branch: str = 'master', # Branch, tag or commit to download from
    timeout: float = 30 # Seconds to wait for the HTTP response
    ):
    "Download a single raw file from a GitHub repository and save it in `dest_dir`."
    url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{src_dir}/{fname}"
    response = requests.get(url, timeout=timeout)

    if response.status_code == 200:
        dest = Path(dest_dir)
        # Make sure the destination folder exists before writing into it
        dest.mkdir(parents=True, exist_ok=True)
        with open(dest / fname, "wb") as file:
            file.write(response.content)
        print(f"{fname} downloaded successfully.")
    else:
        print(f"Error: {response.status_code}")

## Test

In [None]:
#| exports
def test_dfs(
    dfs1: Dict[str, pd.DataFrame], # First dictionary of DataFrames to compare 
    dfs2: Dict[str, pd.DataFrame] # Second dictionary of DataFrames to compare
    ) -> None: # It raises an `AssertionError` if the DataFrames are not equal
    "Compare two dictionaries of DataFrames for equality, regardless of the row (index) and column order."
    # Both dictionaries must hold the same groups: iterating over `dfs1` alone
    # would let groups that only exist in `dfs2` go unnoticed.
    fc.test_eq(set(dfs1.keys()), set(dfs2.keys()))
    for grp in dfs1.keys():
        df1, df2 = (df.sort_index() for df in (dfs1[grp], dfs2[grp]))
        # `reindex` below silently drops columns that only exist in `df2`,
        # so compare the column sets explicitly first.
        fc.test_eq(set(df1.columns), set(df2.columns))
        # Align the column order of `df2` on `df1` before comparing the content
        fc.test_eq(df1, df2.reindex(columns=df1.columns))

## NetCDF Utilities

NetCDF to a dictionary of Data Frames

In [None]:
#| exports
def nc_to_dfs(
    fname: str # Path to NetCDF file
    ) -> dict: # Dictionary with group names as keys and pandas DataFrames as values
    "Convert a NetCDF (with groups) file to a dictionary of dataframes."
    frames = {}
    with Dataset(fname, 'r') as nc:
        for grp_name, grp in nc.groups.items():
            # One column per variable, leaving out variables named after a dimension (like 'id')
            columns = {name: grp.variables[name][:]
                       for name in grp.variables
                       if name not in grp.dimensions}
            frame = pd.DataFrame(columns)

            # When present, 'time' is interpreted as seconds since the epoch
            if 'time' in frame.columns:
                frame['time'] = pd.to_datetime(frame['time'], unit='s')

            # Group names are upper-cased in the returned dictionary
            frames[grp_name.upper()] = frame
    return frames

Example usage:

In [None]:
#| eval: false
# fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/tepco.nc')

dfs = nc_to_dfs(fname)

for grp, df in dfs.items():
    print('group:', grp)
    print(f'shape: {df.shape}')
    print(df.head(), '\n')

group: SEAWATER
shape: (21477, 49)
   sample         lon    lat                time  h3  h3_dl  mn54  mn54_dl  \
0       0  141.029999  37.32 2011-03-21 23:15:00 NaN    NaN   NaN      NaN   
1       1  141.029999  37.32 2011-03-22 14:28:00 NaN    NaN   NaN      NaN   
2       2  141.029999  37.32 2011-03-23 13:51:00 NaN    NaN   NaN      NaN   
3       3  141.029999  37.32 2011-03-24 09:30:00 NaN    NaN   NaN      NaN   
4       4  141.029999  37.32 2011-03-25 10:00:00 NaN    NaN   NaN      NaN   

   co58  co58_dl  ...  te132  te132_dl   i132  i132_dl  cs136  cs136_dl  \
0   5.7      7.6  ...    NaN       NaN  160.0     44.0    6.7       4.7   
1   NaN     15.0  ...    NaN       NaN    NaN     88.0    NaN       7.8   
2   NaN      NaN  ...    NaN       NaN  200.0     58.0    NaN       NaN   
3   NaN      NaN  ...    NaN       NaN  120.0     88.0   68.0      49.0   
4   NaN      NaN  ...   13.0       7.4   58.0     22.0    4.4       3.2   

   tbeta  tbeta_dl  talpha  talpha_dl  
0    

Return properties of the NetCDF file

In [None]:
#| exports
def get_netcdf_properties(file_path: str) -> dict:
    """
    Retrieve general properties of a NetCDF file.

    Parameters:
    file_path (str): Path to the NetCDF file.

    Returns:
    dict: File size in bytes, file format, group names and global attributes.
          An empty dictionary is returned (and a message printed) if the file does not exist.
    """
    path = Path(file_path)

    # Nothing to describe for a missing file
    if not path.exists():
        print(f'File not found: {file_path}')
        return {}

    # The size comes from the file system, everything else from the NetCDF file itself
    size_in_bytes = path.stat().st_size

    with Dataset(file_path, 'r') as nc:
        return {
            'file_size_bytes': size_in_bytes,
            'file_format': nc.file_format,
            'groups': list(nc.groups.keys()),
            'global_attributes': {attr: nc.getncattr(attr) for attr in nc.ncattrs()},
        }

Example usage:

In [None]:
#| eval: false
# fname = Path('../files/nc/encoding-test.nc')
# fname = Path('../../_data/output/dump/100-HELCOM-MORS-2018.nc')
#fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

properties = get_netcdf_properties(fname)

for key, val in properties.items():
    if isinstance(val, dict):
        print(f"{key}:")
        for sub_key, sub_val in val.items():
            print(f"  {sub_key}: {sub_val}")
    else:
        print(f"{key}: {val}")

file_size_bytes: 864768
file_format: NETCDF4
groups: ['biota', 'seawater', 'sediment']
global_attributes:
  id: TBD
  title: Environmental database - Helsinki Commission Monitoring of Radioactive Substances
  summary: MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.

The database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.

The database is updated and quality assured annually by HELCOM MORS EG.
  keywords: oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans

Return group properties of the NetCDF file

In [None]:
#| exports
def get_netcdf_group_properties(file_path: str) -> dict:
    """
    Retrieve properties of each group in a NetCDF file, including dimension sizes.

    Parameters:
    file_path (str): Path to the NetCDF file.

    Returns:
    dict: For each group name, a dictionary with its variable names ('variables'),
          its dimensions and their sizes ('dimensions') and its attributes ('attributes').
          An empty dictionary is returned (and a message printed) if the file does not exist.
    """
    path = Path(file_path)

    # Nothing to describe for a missing file
    if not path.exists():
        print(f'File not found: {file_path}')
        return {}

    with Dataset(file_path, 'r') as nc:
        return {
            name: {
                'variables': list(grp.variables.keys()),
                'dimensions': {dim_name: len(dim) for dim_name, dim in grp.dimensions.items()},
                'attributes': {attr: grp.getncattr(attr) for attr in grp.ncattrs()},
            }
            for name, grp in nc.groups.items()
        }


In [None]:
#| eval: false
# fname = Path('../files/nc/encoding-test.nc')
# fname = Path('../../_data/output/dump/100-HELCOM-MORS-2018.nc')
#fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

properties = get_netcdf_group_properties(fname)

for key, val in properties.items():
    if isinstance(val, dict):
        print(f"{key}:")
        for sub_key, sub_val in val.items():
            print(f"  {sub_key}: {sub_val}")
    else:
        print(f"{key}: {val}")

biota:
  variables: ['lon', 'lat', 'smp_depth', 'time', 'nuclide', 'value', 'unit', 'dl', 'bio_group', 'species', 'body_part', 'drywt', 'wetwt']
  dimensions: {'id': 14873}
  attributes: {}
seawater:
  variables: ['lon', 'lat', 'smp_depth', 'tot_depth', 'time', 'nuclide', 'value', 'unit', 'dl', 'filt']
  dimensions: {'id': 20242}
  attributes: {}
sediment:
  variables: ['lon', 'lat', 'tot_depth', 'time', 'area', 'nuclide', 'value', 'unit', 'dl', 'sed_type', 'top', 'bottom']
  dimensions: {'id': 63868}
  attributes: {}


Return variable properties of the NetCDF file

In [None]:
#| exports
def get_netcdf_variable_properties(file_path: str, as_df: bool = False) -> dict | pd.DataFrame:
    """
    Retrieve properties of variables in each group of a NetCDF file.

    Parameters:
    file_path (str): Path to the NetCDF file
    as_df (bool): If True, returns a pandas DataFrame; if False, returns nested dictionary

    Returns:
    Union[dict, pd.DataFrame]: Properties of variables either as nested dictionary or DataFrame
    """
    var_properties = {}
    
    file = Path(file_path)
    if not file.exists():
        print(f'File not found: {file_path}')
        return var_properties

    with Dataset(file_path, 'r') as nc:
        for group_name, group in nc.groups.items():
            group_vars = {}
            for var_name, var in group.variables.items():
                var_info = {
                    'group': group_name,
                    'variable': var_name,
                    'data_type': var.dtype.str,
                    'dimensions_id': str(var.dimensions),
                    'dimensions_size': str(var.shape),
                }
                # Add variable attributes
                for attr in var.ncattrs():
                    var_info[f'attr_{attr}'] = str(getattr(var, attr))
                    
                group_vars[var_name] = var_info
            var_properties[group_name] = group_vars

    if not as_df:
        return var_properties
    
    # Convert to DataFrame
    rows = []
    for group_name, group_vars in var_properties.items():
        for var_name, var_info in group_vars.items():
            rows.append(var_info)
    
    df = pd.DataFrame(rows)
    
    # Reorder columns to put key information first
    first_cols = ['group', 'variable', 'dimensions_id', 'dimensions_size']
    other_cols = [col for col in df.columns if col not in first_cols]
    df = df[first_cols + other_cols]
    
    return df

In [None]:
#| eval: false
# fname = Path('../files/nc/encoding-test.nc')
# fname = Path('../../_data/output/dump/100-HELCOM-MORS-2018.nc')
#fname = Path('../../_data/output/190-geotraces-2021.nc')
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')

get_netcdf_variable_properties(fname, as_df=True).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
group,biota,biota,biota,biota,biota,biota,biota,biota,biota,biota,...,sediment,sediment,sediment,sediment,sediment,sediment,sediment,sediment,sediment,sediment
variable,lon,lat,smp_depth,time,nuclide,value,unit,dl,bio_group,species,...,tot_depth,time,area,nuclide,value,unit,dl,sed_type,top,bottom
dimensions_id,"('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)",...,"('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)","('id',)"
dimensions_size,"(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)","(14873,)",...,"(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)","(63868,)"
data_type,<f4,<f4,<f4,<u8,<i8,<f4,<i8,<i8,<i8,<i8,...,<f4,<u8,<i8,<i8,<f4,<i8,<i8,<i8,<f4,<f4
attr_long_name,Measurement longitude,Measurement latitude,Sample depth below seal level,Time of measurement,Nuclide,Activity,Unit,Detection limit,Biota group,Species,...,Total depth below seal level,Time of measurement,Marine area/region id,Nuclide,Activity,Unit,Detection limit,Sediment type,Top depth of sediment layer,Bottom depth of sediment layer
attr_standard_name,longitude,latitude,sample_depth_below_sea_floor,time,nuclide,activity,unit,detection_limit,biota_group_tbd,species,...,total_depth_below_sea_floor,time,area_id,nuclide,activity,unit,detection_limit,sediment_type_tbd,top_depth_of_sediment_layer_tbd,bottom_depth_of_sediment_layer_tbd
attr_units,degrees_east,degrees_north,m,seconds since 1970-01-01 00:00:00.0,,,,,,,...,m,seconds since 1970-01-01 00:00:00.0,,,,,,,,
attr_axis,,,Z,T,,,,,,,...,Z,T,,,,,,,,
attr_time_origin,,,,1970-01-01 00:00:00,,,,,,,...,,1970-01-01 00:00:00,,,,,,,,


Return the enum dictionary for a variable in a NetCDF file.

In [None]:
#| exports
def get_enum_dict(file_path: str, var_name: str) -> dict:
    """
    Get the enum dictionary for a variable in a NetCDF file.

    Parameters:
    file_path (str): Path to the NetCDF file
    var_name (str): Name of the variable to get enum for

    Returns:
    dict: For each group holding `var_name` with an enum datatype, a dictionary with
          the variable name ('variable') and its enum dictionary ('enum_dict').
          Empty dict if no group qualifies.
    """
    with Dataset(file_path, 'r') as nc:
        return {
            grp_name: {
                'variable': var_name,
                'enum_dict': grp.variables[var_name].datatype.enum_dict,
            }
            for grp_name, grp in nc.groups.items()
            # Keep only groups where the variable exists and its datatype carries an enum
            if var_name in grp.variables
            and hasattr(grp.variables[var_name].datatype, 'enum_dict')
        }

In [None]:
#| eval: false
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')
nuclide_mapping = get_enum_dict(fname, 'nuclide')
nuclide_mapping

{'biota': {'variable': 'nuclide',
  'enum_dict': {'NOT APPLICABLE': -1,
   'NOT AVAILABLE': 0,
   'h3': 1,
   'be7': 2,
   'c14': 3,
   'k40': 4,
   'cr51': 5,
   'mn54': 6,
   'co57': 7,
   'co58': 8,
   'co60': 9,
   'zn65': 10,
   'sr89': 11,
   'sr90': 12,
   'zr95': 13,
   'nb95': 14,
   'tc99': 15,
   'ru103': 16,
   'ru106': 17,
   'rh106': 18,
   'ag106m': 19,
   'ag108': 20,
   'ag108m': 21,
   'ag110m': 22,
   'sb124': 23,
   'sb125': 24,
   'te129m': 25,
   'i129': 28,
   'i131': 29,
   'cs127': 30,
   'cs134': 31,
   'cs137': 33,
   'ba140': 34,
   'la140': 35,
   'ce141': 36,
   'ce144': 37,
   'pm147': 38,
   'eu154': 39,
   'eu155': 40,
   'pb210': 41,
   'pb212': 42,
   'pb214': 43,
   'bi207': 44,
   'bi211': 45,
   'bi214': 46,
   'po210': 47,
   'rn220': 48,
   'rn222': 49,
   'ra223': 50,
   'ra224': 51,
   'ra225': 52,
   'ra226': 53,
   'ra228': 54,
   'ac228': 55,
   'th227': 56,
   'th228': 57,
   'th232': 59,
   'th234': 60,
   'pa234': 61,
   'u234': 62,
   'u