In [None]:
#| default_exp handlers.data_format_transformation

# Data format transformation  

> A data pipeline handler that transforms MARIS data between different formats. The primary focus is converting NetCDF data into human-readable formats (like CSV, Excel) while preserving data integrity and maintaining standardized variable names and units. This handler implements a modular transformation pipeline using callbacks for each processing step.

:::{.callout-tip}

For new MARIS users, please refer to [Understanding MARIS Data Formats (NetCDF and Open Refine)](https://github.com/franckalbinet/marisco/tree/main/install_configure_guide) for detailed information.

:::

# Dependencies
> Required packages and internal modules for data format transformations

In [None]:
#| export
from pathlib import Path
from netCDF4 import Dataset
import pandas as pd
from fastcore.basics import patch, store_attr
import fastcore.all as fc
from typing import Dict

from marisco.configs import (
    NC_VARS,
    OR_VARS,
    NC_GROUPS,
    Enums,
    lut_path
)

from marisco.callbacks import (
    Callback,
    Transformer,
    DecodeTimeCB
)  
    
from marisco.decoders import (
        NetCDFDecoder
    )


## Configuration and File Paths

In [None]:
# | exports
# Source NetCDF file, and the CSV path derived from it (same folder, same stem).
fname_in = Path('../../_data/output') / '100-HELCOM-MORS-2024.nc'
fname_out = fname_in.parent / f'{fname_in.stem}.csv'

## Data Loading

Load and validate data from standardized MARIS NetCDF files. The NetCDF files follow CF conventions and include standardized variable names, units, and metadata according to MARIS specifications.

In [None]:
#| exports
def load_to_dataframes(fname: str,             # Path to the NetCDF file (`str` or `pathlib.Path`)
                       verbose: bool = False   # Print the columns loaded for each group
                       ) -> Dict[str, pd.DataFrame]:  # One DataFrame per NetCDF group, keyed by upper-cased group name
    """Load NetCDF groups into DataFrames with standardized column names."""
    # NC_VARS is keyed by standardized column name with the NetCDF variable name as value;
    # invert it once so the same mapping can be reused for every group.
    nc_to_col = {nc_var: col for col, nc_var in NC_VARS.items()}
    dfs = {}
    with Dataset(fname, 'r') as nc:
        for group_name, group in nc.groups.items():
            # Read every variable except those sharing a name with one of the group's dimensions
            data = {var_name: var[:]
                    for var_name, var in group.variables.items()
                    if var_name not in group.dimensions}
            # `rename` ignores labels that are absent from the frame, so no pre-filtering is needed
            df = pd.DataFrame(data).rename(columns=nc_to_col)
            dfs[group_name.upper()] = df
            if verbose:
                print(f"Loaded group {group_name} with columns: {df.columns.tolist()}")

    return dfs

In [None]:
#| eval: false
# Needs the local data file referenced by `fname_in`, hence not evaluated automatically.
dfs = load_to_dataframes(fname_in, verbose=True)

Loaded group biota with columns: ['LON', 'LAT', 'SMP_DEPTH', 'TIME', 'NUCLIDE', 'VALUE', 'UNIT', 'DL', 'BIO_GROUP', 'SPECIES', 'BODY_PART', 'DRYWT', 'WETWT']
Loaded group seawater with columns: ['LON', 'LAT', 'SMP_DEPTH', 'TOT_DEPTH', 'TIME', 'NUCLIDE', 'VALUE', 'UNIT', 'DL', 'FILT']
Loaded group sediment with columns: ['LON', 'LAT', 'TOT_DEPTH', 'TIME', 'AREA', 'NUCLIDE', 'VALUE', 'UNIT', 'DL', 'SED_TYPE', 'TOP', 'BOTTOM']


# Validate NetCDF Enumerations

Verify that enumerated values in the NetCDF file match MARIS lookup tables.

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The enumeration validation process is a diagnostic step that identifies inconsistencies between NetCDF enumerations and MARIS lookup tables. While this validation does not modify the dataset, it generates detailed feedback about any mismatches or undefined values. 


:::

In [None]:
#| exports
class ValidateEnumsCB(Callback):
    "Check that the enum types stored in the NetCDF file agree with the MARIS lookup tables."
    def __init__(self, 
                src_fname: str,  # Path to NetCDF file
                enums: Enums,    # MARIS lookup table enums
                verbose: bool = False  # Also print the NetCDF and lookup-table enum dictionaries
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        """Open the source file and validate the enum variables of every group."""
        # `tfm` is not used: the check reads the NetCDF file directly.
        with Dataset(self.src_fname, 'r') as nc:
            for name, grp in nc.groups.items():
                self._validate_group(grp, name)

    def _validate_group(self, group, group_name: str):
        """Compare each enum-typed variable of `group` with its lookup-table counterpart."""
        # Only variables whose datatype exposes an `enum_dict` are of interest
        enum_vars = ((name, var) for name, var in group.variables.items()
                     if hasattr(var.datatype, 'enum_dict'))

        for var_name, var in enum_vars:
            nc_enum_dict = var.datatype.enum_dict
            if self.verbose:
                print(f"nc_enum_dict [{var_name}]:", nc_enum_dict)

            # Reverse lookup in NC_VARS: first column name whose NetCDF variable is `var_name`
            original_col = None
            for col, nc_var in NC_VARS.items():
                if nc_var == var_name:
                    original_col = col
                    break

            # Variables without a (truthy) column name in NC_VARS are skipped
            if original_col:
                self._compare_mappings(
                    nc_enum_dict,
                    self.enums.types[original_col],
                    group_name,
                    var_name,
                    original_col
                )

    def _compare_mappings(self, nc_dict: dict, lut_dict: dict, 
                         group_name: str, var_name: str, col_name: str):
        """Print a warning for every NetCDF enum entry that is absent from, or differs in, the lookup table."""
        if self.verbose:
            print(f"lut_enum [{col_name}]:", lut_dict)

        # One-directional check: entries present only in the lookup table are not reported
        for key, value in nc_dict.items():
            missing = key not in lut_dict
            if missing or lut_dict[key] != value:
                print(
                    f"\nWarning: Enum mismatch in {group_name}/{var_name}",
                    f"NetCDF value: {key} -> {value}",
                    f"Lookup value: {key} -> {lut_dict.get(key, 'Not found')}",
                    sep="\n"
                )

In [None]:
#| eval: false
# Run the enum validation on its own.
dfs = load_to_dataframes(fname_in)
tfm = Transformer(dfs, cbs=[
    # add `verbose=True` to also print the NetCDF and lookup-table enum dictionaries
    ValidateEnumsCB(src_fname=fname_in, enums=Enums(lut_src_dir=lut_path())),
])

tfm()


{'BIOTA':              LON        LAT  SMP_DEPTH        TIME  NUCLIDE       VALUE  UNIT  \
 0      12.316667  54.283333        NaN  1348358400       31    0.010140     5   
 1      12.316667  54.283333        NaN  1348358400        4  135.300003     5   
 2      12.316667  54.283333        NaN  1348358400        9    0.013980     5   
 3      12.316667  54.283333        NaN  1348358400       33    4.338000     5   
 4      12.316667  54.283333        NaN  1348358400       31    0.009614     5   
 ...          ...        ...        ...         ...      ...         ...   ...   
 14868  19.000000  54.583302       61.0  1519603200       53    0.043000     5   
 14869  15.500000  54.333302       65.0  1518480000        4   98.000000     5   
 14870  15.500000  54.333302       65.0  1518480000       33    3.690000     5   
 14871  15.500000  54.333302       65.0  1518480000       53    0.049000     5   
 14872  19.433300  54.363899        NaN  1538524800       33    0.830000     5   
 
     

# Validate NetCDF Variables

Verify that variable names in the NetCDF file match those used in MARIS terminology (the `NC_VARS` mapping).

In [None]:
#| exports
class ValidateNetCDFVarsCB(Callback):
    " Validate that all variables in the NetCDF file are included in NC_VARS mapping. Identifies and reports any unmapped variables."
    def __init__(self, 
                src_fname: str,  # Path to NetCDF file
                verbose: bool = False  # Print a warning for each group that has unmapped variables
                ):
        fc.store_attr()
        # Result of the most recent run: {group name: set of unmapped variable names}
        self.unmapped_vars = {}
        
    def __call__(self, tfm: Transformer):
        """Check each group's variables against NC_VARS mapping.

        The outcome is kept in `self.unmapped_vars` so it can be inspected after
        the run even when `verbose` is off. `tfm` is not used: the check reads
        the NetCDF file directly.
        """
        # NetCDF variable names known to the mapping; identical for every group
        mapped_vars = set(NC_VARS.values())
        unmapped_vars = {}
        
        with Dataset(self.src_fname, 'r') as nc:
            for group_name, group in nc.groups.items():
                group_vars = set(group.variables.keys())
                unmapped = group_vars - mapped_vars - {'id'}  # Exclude dimension variables
                
                if unmapped:
                    unmapped_vars[group_name] = unmapped
                    if self.verbose:
                        print(f"\nWarning: Unmapped variables in group {group_name}:")
                        print(f"Variables: {unmapped}")

        # Replace (not merge) so a re-run never reports stale findings
        self.unmapped_vars = unmapped_vars
        

In [None]:
#| eval: false
# Run the variable-name validation on its own, printing any unmapped variables.
dfs = load_to_dataframes(fname_in)
tfm = Transformer(dfs, cbs=[
    ValidateNetCDFVarsCB(src_fname=fname_in, verbose=True),
])

tfm()


{'BIOTA':              LON        LAT  SMP_DEPTH        TIME  NUCLIDE       VALUE  UNIT  \
 0      12.316667  54.283333        NaN  1348358400       31    0.010140     5   
 1      12.316667  54.283333        NaN  1348358400        4  135.300003     5   
 2      12.316667  54.283333        NaN  1348358400        9    0.013980     5   
 3      12.316667  54.283333        NaN  1348358400       33    4.338000     5   
 4      12.316667  54.283333        NaN  1348358400       31    0.009614     5   
 ...          ...        ...        ...         ...      ...         ...   ...   
 14868  19.000000  54.583302       61.0  1519603200       53    0.043000     5   
 14869  15.500000  54.333302       65.0  1518480000        4   98.000000     5   
 14870  15.500000  54.333302       65.0  1518480000       33    3.690000     5   
 14871  15.500000  54.333302       65.0  1518480000       53    0.049000     5   
 14872  19.433300  54.363899        NaN  1538524800       33    0.830000     5   
 
     

# Remap to human readable 

In [None]:
#| exports
class ConvertToHumanReadableCB(Callback):
    """
    Replace numeric enum codes in the DataFrames with their labels.
    The labels are the keys of each NetCDF enum dictionary.
    """
    def __init__(self, 
                src_fname: str,  # Path to NetCDF file
                verbose: bool = False  # Print one line per converted column
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        """Map every enum-typed column of `tfm.dfs` from code to label."""
        with Dataset(self.src_fname, 'r') as nc:
            for group_name, df in tfm.dfs.items():
                # NC_GROUPS gives the NetCDF group name for a DataFrame key
                group = nc.groups[NC_GROUPS[group_name]]

                for var_name, var in group.variables.items():
                    # Only enum-typed variables carry a code/label dictionary
                    if not hasattr(var.datatype, 'enum_dict'):
                        continue

                    # Reverse lookup in NC_VARS: column name for this NetCDF variable
                    original_col = next(
                        (col for col, nc_var in NC_VARS.items() if nc_var == var_name),
                        None
                    )
                    if not original_col or original_col not in df.columns:
                        continue

                    # The enum dictionary is label -> code; invert it to decode the column
                    code_to_label = {code: label
                                     for label, code in var.datatype.enum_dict.items()}
                    tfm.dfs[group_name][original_col] = df[original_col].map(code_to_label)

                    if self.verbose:
                        print(f"Converted {original_col} to human readable format in {group_name}")

In [None]:
#| eval: false
# Run the enum-to-label conversion on its own, listing each converted column.
dfs = load_to_dataframes(fname_in)
tfm = Transformer(dfs, cbs=[
    ConvertToHumanReadableCB(src_fname=fname_in, verbose=True),
])

tfm()


Converted NUCLIDE to human readable format in BIOTA
Converted UNIT to human readable format in BIOTA
Converted DL to human readable format in BIOTA
Converted BIO_GROUP to human readable format in BIOTA
Converted SPECIES to human readable format in BIOTA
Converted BODY_PART to human readable format in BIOTA
Converted NUCLIDE to human readable format in SEAWATER
Converted UNIT to human readable format in SEAWATER
Converted DL to human readable format in SEAWATER
Converted FILT to human readable format in SEAWATER
Converted AREA to human readable format in SEDIMENT
Converted NUCLIDE to human readable format in SEDIMENT
Converted UNIT to human readable format in SEDIMENT
Converted DL to human readable format in SEDIMENT
Converted SED_TYPE to human readable format in SEDIMENT


{'BIOTA':              LON        LAT  SMP_DEPTH        TIME NUCLIDE       VALUE  \
 0      12.316667  54.283333        NaN  1348358400   cs134    0.010140   
 1      12.316667  54.283333        NaN  1348358400     k40  135.300003   
 2      12.316667  54.283333        NaN  1348358400    co60    0.013980   
 3      12.316667  54.283333        NaN  1348358400   cs137    4.338000   
 4      12.316667  54.283333        NaN  1348358400   cs134    0.009614   
 ...          ...        ...        ...         ...     ...         ...   
 14868  19.000000  54.583302       61.0  1519603200   ra226    0.043000   
 14869  15.500000  54.333302       65.0  1518480000     k40   98.000000   
 14870  15.500000  54.333302       65.0  1518480000   cs137    3.690000   
 14871  15.500000  54.333302       65.0  1518480000   ra226    0.049000   
 14872  19.433300  54.363899        NaN  1538524800   cs137    0.830000   
 
              UNIT               DL BIO_GROUP             SPECIES  \
 0      Bq per kgw  

## Standardize Time

In [None]:
#| eval: false
# Apply only the time-decoding callback, then inspect the resulting TIME column.
dfs = load_to_dataframes(fname_in)
tfm = Transformer(dfs, cbs=[DecodeTimeCB()])

tfm()

print(tfm.dfs['BIOTA']['TIME'])


0       2012-09-23
1       2012-09-23
2       2012-09-23
3       2012-09-23
4       2012-09-23
           ...    
14868   2018-02-26
14869   2018-02-13
14870   2018-02-13
14871   2018-02-13
14872   2018-10-03
Name: TIME, Length: 14873, dtype: datetime64[ns]


## Review all callbacks

In [None]:
#| eval: false
# Chain both validators, the enum conversion and the time decoding in a single pass.
dfs = load_to_dataframes(fname_in)
tfm = Transformer(dfs, cbs=[
    ValidateEnumsCB(src_fname=fname_in, enums=Enums(lut_src_dir=lut_path())),
    ValidateNetCDFVarsCB(src_fname=fname_in),
    ConvertToHumanReadableCB(src_fname=fname_in),
    DecodeTimeCB(),
])

tfm()

print(tfm.dfs['BIOTA'])

             LON        LAT  SMP_DEPTH       TIME NUCLIDE       VALUE  \
0      12.316667  54.283333        NaN 2012-09-23   cs134    0.010140   
1      12.316667  54.283333        NaN 2012-09-23     k40  135.300003   
2      12.316667  54.283333        NaN 2012-09-23    co60    0.013980   
3      12.316667  54.283333        NaN 2012-09-23   cs137    4.338000   
4      12.316667  54.283333        NaN 2012-09-23   cs134    0.009614   
...          ...        ...        ...        ...     ...         ...   
14868  19.000000  54.583302       61.0 2018-02-26   ra226    0.043000   
14869  15.500000  54.333302       65.0 2018-02-13     k40   98.000000   
14870  15.500000  54.333302       65.0 2018-02-13   cs137    3.690000   
14871  15.500000  54.333302       65.0 2018-02-13   ra226    0.049000   
14872  19.433300  54.363899        NaN 2018-10-03   cs137    0.830000   

             UNIT               DL BIO_GROUP             SPECIES  \
0      Bq per kgw  Detection limit      Fish        Gad

In [None]:
#| eval: false
# `tfm` is only created by cells marked `eval: false`, so this cell must not be evaluated automatically either.
tfm.dfs['BIOTA']['TIME'].dtype


dtype('<M8[ns]')

## Decoding NETCDF

In [None]:
#| export
def decode(
    fname_in: str, # Input file name
    dest_out: str, # Output file name
    output_format: str = 'csv', # Output format forwarded to `NetCDFDecoder`
    remap_vars: Dict[str, str] = OR_VARS, # Variable renaming map forwarded to `NetCDFDecoder`
    **kwargs # Additional arguments (accepted but currently unused)
    ) -> None:
    "Decode data from NetCDF."
    dfs = load_to_dataframes(fname_in)
    # Validation callbacks first, then the ones that rewrite the frames
    tfm = Transformer(
        dfs,
        cbs=[
            ValidateEnumsCB(
                src_fname=fname_in,
                enums=Enums(lut_src_dir=lut_path())
                ),
            ValidateNetCDFVarsCB(
                src_fname=fname_in
                ),
            ConvertToHumanReadableCB(
                src_fname=fname_in),
            DecodeTimeCB()
        ]
    )

    tfm()
    decoder = NetCDFDecoder(
        # Pass the frames held by the transformer: the callbacks write their results to
        # `tfm.dfs`, which is not guaranteed to be the same objects as the local `dfs`.
        # NOTE(review): confirm whether `Transformer` copies its input.
        dfs=tfm.dfs,
        fname_in=fname_in,
        dest_out=dest_out,
        # Honour the caller's choices instead of hard-coded values (defaults are unchanged)
        output_format=output_format,
        remap_vars=remap_vars,
        verbose=False
    )
    decoder.decode()

In [None]:
#|eval: false
# Decode the sample file; the destination is the input path with its extension stripped.
fname = Path('../../_data/output') / '100-HELCOM-MORS-2024.nc'
decode(fname_in=fname, dest_out=fname.parent / fname.stem)

**TODO:** Review the time output.