In [None]:
#| default_exp handlers.data_format_transformation

# Data format transformation  

> A data pipeline handler that transforms MARIS data from NetCDF to CSV. The primary focus is on converting NetCDF data into MARIS Standard Open-Refine format while preserving data integrity. This handler implements a modular transformation pipeline using callbacks for each processing step, ensuring flexibility and extensibility in data handling.

:::{.callout-tip}

For new MARIS users, please refer to the [field definitions](https://github.com/franckalbinet/marisco/blob/main/nbs/metadata/field-definition.ipynb) for detailed information about MARIS fields.

:::

# Dependencies
> Required packages and internal modules for data format transformations

In [None]:
#| export
from pathlib import Path
from netCDF4 import Dataset
import pandas as pd
import fastcore.all as fc
from typing import Dict, Callable, Tuple

from marisco.configs import (
    NC_VARS,
    OR_VARS,
    NC_GROUPS,
    OR_DTYPES,
    Enums,
    lut_path,
    species_lut_path,
    cfg
)

from marisco.utils import (
    ExtractNetcdfContents,
)

from marisco.callbacks import (
    Callback,
    Transformer,
    DecodeTimeCB,
    AddSampleTypeIdColumnCB
)  
    
from marisco.decoders import (
        NetCDFDecoder
    )
from marisco.metadata import (
    ZoteroItem
)

## Configuration and File Paths

In [None]:
#| eval: false
# Input MARIS NetCDF file; the CSV output path reuses the same stem with a `.csv` suffix.
fname_in =  Path('../../_data/output/100-HELCOM-MORS-2024.nc')
fname_out = fname_in.with_suffix('.csv')
# Output flavour used by the demo cells below.
output_format = 'openrefine_csv'

## Data Loading

Load data from standardized MARIS NetCDF files using ExtractNetcdfContents. The NetCDF files follow CF conventions and include standardized variable names and metadata according to MARIS specifications.

In [None]:
#| eval: false
# Parse the NetCDF file once; the cells below read `.dfs`, `.enum_dicts` and `.global_attrs` from it.
contents=ExtractNetcdfContents(fname_in)

Show the dictionary of dataframes extracted from the NetCDF file. 

In [None]:
#| eval: false
# Dictionary keyed by group name (e.g. 'BIOTA'), one DataFrame per group.
contents.dfs

{'BIOTA':              LON        LAT  SMP_DEPTH        TIME  NUCLIDE       VALUE  UNIT  \
 0      12.316667  54.283333        NaN  1348358400       31    0.010140     5   
 1      12.316667  54.283333        NaN  1348358400        4  135.300003     5   
 2      12.316667  54.283333        NaN  1348358400        9    0.013980     5   
 3      12.316667  54.283333        NaN  1348358400       33    4.338000     5   
 4      12.316667  54.283333        NaN  1348358400       31    0.009614     5   
 ...          ...        ...        ...         ...      ...         ...   ...   
 16089  21.395000  61.241501        2.0  1652140800       33   13.700000     4   
 16090  21.395000  61.241501        2.0  1652140800        9    0.500000     4   
 16091  21.385000  61.343334        NaN  1663200000        4   50.700001     4   
 16092  21.385000  61.343334        NaN  1663200000       33    0.880000     4   
 16093  21.385000  61.343334        NaN  1663200000       12    6.600000     4   
 
     

Show the dictionary of enums extracted from the NetCDF file. 

In [None]:
#| eval: false
# `contents` is only created in cells flagged `eval: false`, so this cell must be
# flagged too; otherwise it raises NameError when the notebook is run as a test.
contents.enum_dicts

{'BIOTA': {'nuclide': {'NOT APPLICABLE': '-1',
   'NOT AVAILABLE': '0',
   'h3': '1',
   'be7': '2',
   'c14': '3',
   'k40': '4',
   'cr51': '5',
   'mn54': '6',
   'co57': '7',
   'co58': '8',
   'co60': '9',
   'zn65': '10',
   'sr89': '11',
   'sr90': '12',
   'zr95': '13',
   'nb95': '14',
   'tc99': '15',
   'ru103': '16',
   'ru106': '17',
   'rh106': '18',
   'ag106m': '19',
   'ag108': '20',
   'ag108m': '21',
   'ag110m': '22',
   'sb124': '23',
   'sb125': '24',
   'te129m': '25',
   'i129': '28',
   'i131': '29',
   'cs127': '30',
   'cs134': '31',
   'cs137': '33',
   'ba140': '34',
   'la140': '35',
   'ce141': '36',
   'ce144': '37',
   'pm147': '38',
   'eu154': '39',
   'eu155': '40',
   'pb210': '41',
   'pb212': '42',
   'pb214': '43',
   'bi207': '44',
   'bi211': '45',
   'bi214': '46',
   'po210': '47',
   'rn220': '48',
   'rn222': '49',
   'ra223': '50',
   'ra224': '51',
   'ra225': '52',
   'ra226': '53',
   'ra228': '54',
   'ac228': '55',
   'th227': '56',
 

Show the global attributes extracted from the NetCDF file. 

In [None]:
#| eval: false
# `contents` is only created in cells flagged `eval: false`, so this cell must be
# flagged too; otherwise it raises NameError when the notebook is run as a test.
contents.global_attrs

{'id': '26VMZZ2Q',
 'title': 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances',
 'summary': 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.',
 'keywords': 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers, Earth Science > Oceans > Marine Sediments, Eart

## Validate NetCDF Enumerations

Verify that enumerated values in the NetCDF file match current MARIS lookup tables.

:::{.callout-tip}

**FEEDBACK TO DATA PROVIDER**: The enumeration validation process is a diagnostic step that identifies inconsistencies between NetCDF enumerations and MARIS lookup tables. While this validation does not modify the dataset, it generates detailed feedback about any mismatches or undefined values. 


:::

In [None]:
#| exports
class ValidateEnumsCB(Callback):
    """Validate enumeration mappings between NetCDF file and MARIS lookup tables.

    This callback is purely diagnostic: it opens the NetCDF file read-only,
    compares every enum-typed variable with the matching MARIS lookup table and
    prints a warning for each discrepancy. It never modifies `tfm.dfs`.
    """
    def __init__(self, 
                src_fname: str,  # Path to NetCDF file
                enums: Enums,    # MARIS lookup table enums
                verbose: bool = False
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        """Process each group in the NetCDF file and validate its enums."""
        with Dataset(self.src_fname, 'r') as nc:
            for group_name, group in nc.groups.items():
                self._validate_group(group, group_name)
    
    def _validate_group(self, group, group_name: str):
        """Validate the enum mappings of every enum-typed variable in `group`."""
        for var_name, var in group.variables.items():
            # Only enum-typed variables carry an `enum_dict`; skip everything else.
            if not hasattr(var.datatype, 'enum_dict'): 
                continue
            
            nc_enum_dict = var.datatype.enum_dict
            if self.verbose:
                print(f"nc_enum_dict [{var_name}]:", nc_enum_dict)

            # Reverse lookup in NC_VARS: first column whose NetCDF name is `var_name`.
            original_col = next((col for col, nc_var in NC_VARS.items() 
                               if nc_var == var_name), None)
            if not original_col: 
                continue

            # A missing lookup table is reported instead of raising KeyError, so
            # that one unknown enum does not abort validation of the other variables.
            lut_dict = self.enums.types.get(original_col)
            if lut_dict is None:
                print(f"\nWarning: No MARIS lookup table found for {group_name}/{var_name} "
                      f"(column {original_col}); skipping validation.")
                continue

            self._compare_mappings(
                nc_enum_dict,
                lut_dict,
                group_name,
                var_name,
                original_col
            )
    
    def _compare_mappings(self, nc_dict: dict, lut_dict: dict, 
                         group_name: str, var_name: str, col_name: str):
        """Print a warning for each NetCDF enum entry that is missing from, or
        maps to a different value than, the lookup table dictionary."""
        if self.verbose:
            print(f"lut_enum [{col_name}]:", lut_dict)
            
        # One-directional check: NetCDF entries are looked up in the lookup table.
        for key, value in nc_dict.items():
            if key not in lut_dict or lut_dict[key] != value:
                print(f"\nWarning: Enum mismatch in {group_name}/{var_name}")
                print(f"NetCDF value: {key} -> {value}")
                print(f"Lookup value: {key} -> {lut_dict.get(key, 'Not found')}")

In [None]:
#| eval: false
# Run the enum validation on its own; mismatches, if any, are printed as warnings.
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    contents.dfs,
    cbs=[
        ValidateEnumsCB(
            src_fname=fname_in,
            enums=Enums(lut_src_dir=lut_path()),
            #verbose=True
        ),
    ]
)
tfm()
print('\n')





## Remove Non Open Refine Columns 

The ``RemoveNonORVarsCB`` class filters out variables from the NetCDF format that do not align with the requirements for MARIS's OpenRefine data processing.

In [None]:
#| exports
class RemoveNonORVarsCB(Callback):
    """Remove variables not defined in OR_VARS configuration.

    Every dataframe held by the transformer is replaced by a copy from which
    all columns that are not keys of `or_vars` have been dropped.
    """
    def __init__(self, 
                or_vars: Dict[str, str] = OR_VARS,  # Dictionary mapping OR vars to NC vars
                verbose: bool = False,
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        """Remove non-OR variables from all dataframes."""
        for group_name in tfm.dfs:
            tfm.dfs[group_name] = self._remove_non_or_vars(tfm.dfs[group_name], group_name)
            
    def _remove_non_or_vars(self, df: pd.DataFrame, group_name:str ) -> pd.DataFrame:
        """Return `df` without the columns absent from `or_vars`; report them if verbose."""
        or_cols = set(self.or_vars)
        # Keep the dataframe's own column order (de-duplicated) so the verbose
        # message is reproducible from run to run instead of following set order.
        cols_to_remove = list(dict.fromkeys(col for col in df.columns if col not in or_cols))
        
        if self.verbose and cols_to_remove:
            # `map(str, ...)` keeps the report working for non-string column labels.
            print(f"Removing variables that are not compatible with MARIS's OpenRefine processing. \nRemoving {', '.join(map(str, cols_to_remove))} from {group_name} dataset.")
                        
        return df.drop(columns=cols_to_remove)


In [None]:
#| eval: false
# With verbose=True the callback prints which columns it drops from each group.
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    contents.dfs,
    cbs=[
        RemoveNonORVarsCB(verbose=True),
    ]
)
tfm()
print('\n')

Removing variables that are not compatible with MARIS's OpenRefine processing. 
Removing BIO_GROUP from BIOTA dataset.




## Add Taxon Information

In [None]:
#| exports
# Column names as they appear in the species lookup table, mapped to the
# upper-case names used for the taxon columns added to the BIOTA dataframe.
TAXON_KEY_MAP = {
    source_name: source_name.upper()
    for source_name in ('Taxonname', 'Taxonrank', 'TaxonDB', 'TaxonDBID', 'TaxonDBURL')
}

In [None]:
#| exports
def get_taxon_info_lut(maris_lut: str, key_names: dict = TAXON_KEY_MAP) -> dict:
    """Create lookup dictionary for taxon information from MARIS species lookup table.

    `maris_lut` is the path of the Excel species lookup table. `key_names` maps
    the source column names to the output names. The result has the shape
    `{output_name: {species_id: value}}`.

    Raises `KeyError` if `species_id` or any source column is missing.
    """
    species = pd.read_excel(maris_lut)
    columns = ['species_id', *key_names]
    # Fail with an explicit message rather than pandas' generic "not in index".
    missing = [col for col in columns if col not in species.columns]
    if missing:
        raise KeyError(f"Columns missing from species lookup table {maris_lut}: {missing}")
    # Select columns and rename them to standardized format
    df = species[columns].rename(columns=key_names)
    return df.set_index('species_id').to_dict()


def lut_taxon() -> dict:
    "Build the taxon lookup dictionary from the default MARIS species lookup table."
    return get_taxon_info_lut(maris_lut=species_lut_path(), key_names=TAXON_KEY_MAP)

In [None]:
#| exports
class AddTaxonInformationCB(Callback):
    """Enrich the BIOTA dataframe with taxon columns looked up from its SPECIES ids."""

    def __init__(self, 
                fn_lut: Callable = lut_taxon,  # Function that returns taxon lookup dictionary
                verbose: bool = False
                ):
        fc.store_attr()

    def __call__(self, tfm: Transformer):
        """Add the taxon columns when both the BIOTA group and its SPECIES column exist."""
        if self.check_biota_group_exists(tfm):
            biota = tfm.dfs['BIOTA']
            if self.check_species_column_exists(biota):
                self.add_taxon_columns(biota)

    def check_biota_group_exists(self, tfm: Transformer) -> bool:
        """Return whether `tfm.dfs` holds a 'BIOTA' group; say so if verbose and absent."""
        has_biota = 'BIOTA' in tfm.dfs
        if not has_biota and self.verbose:
            print("No BIOTA group found, skipping taxon information")
        return has_biota

    def check_species_column_exists(self, df: pd.DataFrame) -> bool:
        """Return whether `df` has a 'SPECIES' column; say so if verbose and absent."""
        has_species = 'SPECIES' in df.columns
        if not has_species and self.verbose:
            print("No SPECIES column found in BIOTA dataframe, skipping taxon information")
        return has_species

    def add_taxon_columns(self, df: pd.DataFrame):
        """Write one column per lookup entry into `df` (in place), then report misses."""
        # Each lookup entry maps species id -> value; ids without a match become 'Unknown'.
        for taxon_col, value_by_species in self.fn_lut().items():
            df[taxon_col] = df['SPECIES'].map(value_by_species).fillna('Unknown')

        self.report_unmatched_species(df)

    def report_unmatched_species(self, df: pd.DataFrame):
        """Print, when verbose, the SPECIES ids whose TAXONNAME resolved to 'Unknown'."""
        is_unknown = df['TAXONNAME'] == 'Unknown'
        missing_ids = df.loc[is_unknown, 'SPECIES'].unique()
        if self.verbose and len(missing_ids) > 0:
            print(f"Warning: Species IDs not found in lookup table: {', '.join(map(str, missing_ids))}")

In [None]:
#| eval: false
# Apply only the taxon callback, then show the columns it adds to the BIOTA group.
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    contents.dfs,
    cbs=[
        AddTaxonInformationCB(
            fn_lut=lut_taxon
        ),
    ]
)

tfm()
print(tfm.dfs['BIOTA'][['TAXONNAME','TAXONRANK','TAXONDB','TAXONDBID','TAXONDBURL']])


               TAXONNAME TAXONRANK   TAXONDB TAXONDBID  \
0           Gadus morhua   species  Wikidata   Q199788   
1           Gadus morhua   species  Wikidata   Q199788   
2           Gadus morhua   species  Wikidata   Q199788   
3           Gadus morhua   species  Wikidata   Q199788   
4           Gadus morhua   species  Wikidata   Q199788   
...                  ...       ...       ...       ...   
16089  Fucus vesiculosus   species  Wikidata   Q754755   
16090  Fucus vesiculosus   species  Wikidata   Q754755   
16091     Mytilus edulis   species  Wikidata    Q27855   
16092     Mytilus edulis   species  Wikidata    Q27855   
16093     Mytilus edulis   species  Wikidata    Q27855   

                                  TAXONDBURL  
0      https://www.wikidata.org/wiki/Q199788  
1      https://www.wikidata.org/wiki/Q199788  
2      https://www.wikidata.org/wiki/Q199788  
3      https://www.wikidata.org/wiki/Q199788  
4      https://www.wikidata.org/wiki/Q199788  
...                  

## Remap to OR mappings

> **Note:** This operation must take place before `DataFormatConversionCB`, as it relies on the data still being in its encoded state.

`RemapToORSpecificMappingsCB` transforms values into OpenRefine-specific formats.

In [None]:
#| exports
# Per-column translation from integer codes to the string labels expected in
# the OpenRefine output. Labels are listed in code order: 0, 1, 2.
or_mappings = {
    'DL': dict(enumerate(('ND', '=', '<'))),
    'FILT': dict(enumerate(('NA', 'Y', 'N'))),
}

In [None]:
#| exports
class RemapToORSpecificMappingsCB(Callback):
    """Convert values using OR mappings if columns exist in dataframe.

    The mappings are keyed by the encoded (integer) values, so this callback
    must run while the affected columns are still encoded. Values with no
    entry in the mapping become NaN (the behaviour of `Series.map`); such
    values are reported so the loss is never silent.
    """
    def __init__(self, 
                or_mappings: Dict[str, Dict] = or_mappings,  # Dictionary of column mappings, 
                verbose: bool = False
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        """Apply OR mappings to all dataframes."""
        for group_name in tfm.dfs:
            if self.verbose:
                print(f"\nProcessing {group_name} group...")
            tfm.dfs[group_name] = self._apply_mappings(tfm.dfs[group_name])
            
    def _apply_mappings(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply OR mappings (in place) to the mapped columns present in `df`."""
        for col, mapping in self.or_mappings.items():
            if col not in df.columns:
                continue
            if self.verbose:
                print(f"    Mapping values for column: {col}")
            remapped = df[col].map(mapping)
            # Values that were present but have no mapping end up as NaN; this
            # also happens if the callback is applied to already-remapped data.
            unmapped = df.loc[df[col].notna() & remapped.isna(), col].unique()
            if len(unmapped) > 0:
                print(f"Warning: values in column {col} without an OR mapping were set to NaN: "
                      f"{', '.join(map(str, unmapped))}")
            df[col] = remapped
        return df


In [None]:
#| eval: false
# Apply only the OR-specific remapping, then list the resulting labels per group.
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    contents.dfs,
    cbs=[
        RemapToORSpecificMappingsCB(),
    ]
)

tfm()

# Loop through each group in the 'dfs' dictionary
for group_name, df in tfm.dfs.items():
    # Check if the group dataframe contains any of the columns specified in or_mappings.keys()
    relevant_columns = [col for col in or_mappings.keys() if col in df.columns]
    if relevant_columns:
        # Print the unique values from the relevant columns
        print(f"\nUnique values in {group_name} for columns {relevant_columns}:")
        for col in relevant_columns:
            print(f"{col}: {df[col].unique()}")
    else:
        print(f"No relevant columns found in {group_name} based on or_mappings keys.")


Unique values in BIOTA for columns ['DL']:
DL: ['<' '=' 'ND']

Unique values in SEAWATER for columns ['DL', 'FILT']:
DL: ['=' '<' 'ND']
FILT: ['NA' 'N' 'Y']

Unique values in SEDIMENT for columns ['DL']:
DL: ['=' '<' 'ND']


## Remap to human readable 

OR_DTYPES (defined in configs.ipynb) categorizes each OpenRefine variable as 'decoded' or 'encoded', specifying the data format required for processing. Let's review OR_DTYPES:

In [None]:
#| eval: false
# Show OR_DTYPES as a one-row table: one column per variable, with its 'state'.
# The display limits are lifted only inside this `with` block.
with pd.option_context('display.max_columns', None, 'display.max_colwidth', None):
    display(pd.DataFrame.from_dict(OR_DTYPES, orient='index').T)

Unnamed: 0,TIME,AREA,NUCLIDE,UNIT,DL,FILT,COUNT_MET,SAMP_MET,PREP_MET,SPECIES,BODY_PART,SED_TYPE,LAB,PROFILE_ID,SAMPLE_TYPE,TAXONNAME,TAXONREPNAME,TAXONRANK,TAXONDB,TAXONDBID,TaxonDBURL
state,decoded,decoded,encoded,encoded,decoded,decoded,encoded,encoded,encoded,encoded,encoded,encoded,encoded,decoded,decoded,decoded,decoded,decoded,decoded,decoded,decoded


Should we use the enums stored in the NetCDF file or the enums shipped with the marisco package? They should be the same but might become inconsistent over time.  
We chose to use the enums of the marisco package, because small changes to an enum description can then be implemented directly in the package's enums.




In [None]:
# Raw configuration dictionary: variable name -> {'state': 'encoded' | 'decoded'}.
OR_DTYPES

{'TIME': {'state': 'decoded'},
 'AREA': {'state': 'decoded'},
 'NUCLIDE': {'state': 'encoded'},
 'UNIT': {'state': 'encoded'},
 'DL': {'state': 'decoded'},
 'FILT': {'state': 'decoded'},
 'COUNT_MET': {'state': 'encoded'},
 'SAMP_MET': {'state': 'encoded'},
 'PREP_MET': {'state': 'encoded'},
 'SPECIES': {'state': 'encoded'},
 'BODY_PART': {'state': 'encoded'},
 'SED_TYPE': {'state': 'encoded'},
 'LAB': {'state': 'encoded'},
 'PROFILE_ID': {'state': 'decoded'},
 'SAMPLE_TYPE': {'state': 'decoded'},
 'TAXONNAME': {'state': 'decoded'},
 'TAXONREPNAME': {'state': 'decoded'},
 'TAXONRANK': {'state': 'decoded'},
 'TAXONDB': {'state': 'decoded'},
 'TAXONDBID': {'state': 'decoded'},
 'TaxonDBURL': {'state': 'decoded'}}

In [None]:
#| exports
class DataFormatConversionCB(Callback):
    """
    A callback to convert DataFrame enum values between encoded and decoded formats based on specified settings.

    The target state of every variable is derived from `output_format`:
    'decoded_csv' and 'encoded_csv' force all variables to one state, while
    'openrefine_csv' uses the per-variable states declared in `OR_DTYPES`.
    Columns listed in `mappings` and the `TIME` column are never decoded here.
    """
    
    def __init__(self, 
                 output_format: str = 'openrefine_csv',  # Desired output format
                 mappings: Dict = or_mappings,  # Dictionary mapping encoded values to human-readable ones
                 verbose: bool = False  # Flag for verbose output
                ):
        fc.store_attr()
        # Work on a private copy: `determine_target_state` rewrites the states and
        # must not alter the module-level OR_DTYPES shared by other instances.
        self.dtypes = self._copy_or_dtypes()

    @staticmethod
    def _copy_or_dtypes() -> Dict[str, Dict[str, str]]:
        "Return an independent copy of `OR_DTYPES` (the outer dict and each per-variable dict)."
        return {name: dict(spec) for name, spec in OR_DTYPES.items()}

    def __call__(self, tfm):
        """
        Apply the data format conversion to each DataFrame within the Transformer based on the specified output format.

        An unsupported `output_format` leaves the data untouched (a message is printed when verbose).
        """
        allowed_formats = ['decoded_csv', 'encoded_csv', 'openrefine_csv']
        
        if self.output_format not in allowed_formats:
            if self.verbose:
                print(f"Invalid format. Allowed formats: {', '.join(allowed_formats)}.")
            return
        
        self.determine_target_state()
        self.load_enums()
        
        # Diagnostic output only; keep it quiet unless explicitly requested.
        if self.verbose:
            print(self.dtypes)
        
        for group_name, df in tfm.dfs.items():
            tfm.dfs[group_name] = self.process_dataframe(group_name, df)

    def determine_target_state(self):
        """
        Determine the target state ('encoded' or 'decoded') based on the output format.
        """
        # Always restart from the pristine configuration so repeated calls are idempotent.
        self.dtypes = self._copy_or_dtypes()
        if self.output_format in ['decoded_csv', 'encoded_csv']:
            state = 'decoded' if self.output_format == 'decoded_csv' else 'encoded'
            for spec in self.dtypes.values():
                spec['state'] = state

    def load_enums(self):
        """
        Load enums from the lookup path.
        """
        self.enums = Enums(lut_path())
        if self.verbose:
            print(f"Loaded enums: {self.enums.types.keys()}")

    def process_dataframe(self, group_name: str, df: pd.DataFrame):
        """
        Process each DataFrame to convert columns to the target state.

        Only decoding is performed: a column is decoded (in place) when its target
        state is 'decoded', it is not handled by `mappings`, and it is not `TIME`.
        """
        for column in df.columns:
            wants_decoding = (
                column in self.dtypes
                and self.dtypes[column]['state'] == 'decoded'
                and column not in self.mappings
                and column != 'TIME'
            )
            if not wants_decoding:
                continue
            if self.verbose:
                print(f"Decoding column: {column}")
            if column in self.enums.types:
                # Apply the mapping from encoded to decoded values
                df[column] = df[column].map(self.enums.types[column])
            elif self.verbose:
                print(f"No enum mapping found for column: {column}, skipping decoding.")
        return df

In [None]:
#| eval: false
# Drop non-OpenRefine columns first, then convert the remaining ones according to the format.
contents = ExtractNetcdfContents(fname_in)
output_format = 'openrefine_csv'
tfm = Transformer(
    contents.dfs,
    cbs=[
        RemoveNonORVarsCB(),
        DataFormatConversionCB(
            output_format=output_format,
            mappings = or_mappings,
            verbose=True
        ),
    ]
)
tfm()
print('\n')

Loaded enums: dict_keys(['AREA', 'BIO_GROUP', 'BODY_PART', 'COUNT_MET', 'DL', 'FILT', 'NUCLIDE', 'PREP_MET', 'SAMP_MET', 'SED_TYPE', 'SPECIES', 'UNIT', 'LAB'])
{'TIME': {'state': 'decoded'}, 'AREA': {'state': 'decoded'}, 'NUCLIDE': {'state': 'encoded'}, 'UNIT': {'state': 'encoded'}, 'DL': {'state': 'decoded'}, 'FILT': {'state': 'decoded'}, 'COUNT_MET': {'state': 'encoded'}, 'SAMP_MET': {'state': 'encoded'}, 'PREP_MET': {'state': 'encoded'}, 'SPECIES': {'state': 'encoded'}, 'BODY_PART': {'state': 'encoded'}, 'SED_TYPE': {'state': 'encoded'}, 'LAB': {'state': 'encoded'}, 'PROFILE_ID': {'state': 'decoded'}, 'SAMPLE_TYPE': {'state': 'decoded'}, 'TAXONNAME': {'state': 'decoded'}, 'TAXONREPNAME': {'state': 'decoded'}, 'TAXONRANK': {'state': 'decoded'}, 'TAXONDB': {'state': 'decoded'}, 'TAXONDBID': {'state': 'decoded'}, 'TaxonDBURL': {'state': 'decoded'}}




## Standardize Time

**TODO**: `TIME` should be encoded or decoded based on the configuration.


In [None]:
#| eval: false
# Apply only the time-decoding callback and inspect the resulting TIME column.
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    contents.dfs,
    cbs=[
        DecodeTimeCB(),
    ]
)

tfm()

print(tfm.dfs['BIOTA']['TIME'])


0       2012-09-23
1       2012-09-23
2       2012-09-23
3       2012-09-23
4       2012-09-23
           ...    
16089   2022-05-10
16090   2022-05-10
16091   2022-09-15
16092   2022-09-15
16093   2022-09-15
Name: TIME, Length: 16094, dtype: datetime64[ns]


## Add Sample Type ID

In [None]:
#| eval: false
# `load_to_dataframes` is neither defined nor imported in this notebook; load the
# dataframes through `ExtractNetcdfContents`, as every other cell does.
dfs = ExtractNetcdfContents(fname_in).dfs
tfm = Transformer(
    dfs,
    cbs=[
        AddSampleTypeIdColumnCB(),
    ]
)

tfm()
print(tfm.dfs['SEAWATER']['samptype_id'].unique())
print(tfm.dfs['BIOTA']['samptype_id'].unique())
print(tfm.dfs['SEDIMENT']['samptype_id'].unique())


[1]
[2]
[3]


## Add Reference ID


Include the `ref_id` (i.e., Zotero Archive Location) of the MARIS data. The `AddZoteroArchiveLocationCB` callback looks up the Zotero Archive Location based on the Zotero key stored as `id` in the global attributes of the MARIS NetCDF file.

In [None]:

#| export
class AddZoteroArchiveLocationCB(Callback):
    """Fetch and append 'Loc. in Archive' from Zotero to DataFrame.

    The Zotero key is read from the `id` global attribute of the NetCDF file.
    When the matching Zotero item exists, its archive location is written as
    an integer `REF_ID` column to every dataframe; otherwise a warning is
    printed and the dataframes are left untouched.
    """
    def __init__(self, src_fname: str, cfg: dict):
        self.src_fname = src_fname
        self.cfg = cfg

    def __call__(self, tfm):
        # Read only the global attribute rather than re-parsing the whole file.
        with Dataset(self.src_fname, 'r') as nc:
            zotero_key = nc.getncattr('id')

        item = ZoteroItem(zotero_key, self.cfg['zotero'])
        if not item.exist():
            # The key is reported from the local variable: the instance has no `item_id`.
            print(f"Warning: Zotero item {zotero_key} does not exist.")
            return

        ref_id = int(item.item['data']['archiveLocation'])
        for df in tfm.dfs.values():
            df['REF_ID'] = ref_id

In [None]:
#| eval: false
# `load_to_dataframes` is neither defined nor imported in this notebook; load the
# dataframes through `ExtractNetcdfContents`, as every other cell does.
dfs = ExtractNetcdfContents(fname_in).dfs
tfm = Transformer(
    dfs,
    cbs=[
        AddZoteroArchiveLocationCB(src_fname=fname_in, cfg=cfg()),
    ]
)
tfm()
print(tfm.dfs['SEAWATER']['REF_ID'].unique())


[100]


## Review all callbacks

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
# 'openrefine_csv' is used here because the conversion callback only acts on
# 'decoded_csv', 'encoded_csv' and 'openrefine_csv'.
output_format = 'openrefine_csv'

# Diagnostics only: mismatches are printed, the data is left untouched.
# NOTE(review): `ValidateNetCDFVarsCB` was listed here but is neither defined nor
# imported in this notebook; add it back once it is available.
cbs_validation = [
    ValidateEnumsCB(
        src_fname=fname_in,
        enums=Enums(lut_src_dir=lut_path())
    ),
]

# OpenRefine-specific steps; the remapping must run while values are still encoded.
cbs_or = [
    RemoveNonORVarsCB(),
    RemapToORSpecificMappingsCB(
        or_mappings=or_mappings,
    ),
]

cbs_general = [
    AddTaxonInformationCB(
        fn_lut=lut_taxon
    ),
    DataFormatConversionCB(
        output_format=output_format
    ),
    DecodeTimeCB(),
    AddSampleTypeIdColumnCB(),
    AddZoteroArchiveLocationCB(src_fname=fname_in, cfg=cfg()),
]

cbs = cbs_validation + cbs_or + cbs_general

tfm = Transformer(contents.dfs, cbs=cbs)
tfm()
print(tfm.dfs['BIOTA'])

NameError: name 'ValidateEnumsCB' is not defined

In [None]:
#| eval: false
# `tfm` is only created in cells flagged `eval: false`, so this cell must be
# flagged too; otherwise it raises NameError when the notebook is run as a test.
tfm.dfs['SEAWATER']['DL'].unique()

array([nan], dtype=object)

## Decoding NETCDF

In [None]:
#| export
def decode(
    fname_in: str, # Input file name
    dest_out: str | None = None, # Output file name (optional)
    output_format: str = 'openrefine_csv',
    remap_vars: Dict[str, str] = OR_VARS,
    verbose: bool = False,
    **kwargs # Additional arguments
    ) -> None:
    """Decode data from NetCDF.

    Loads the per-group dataframes from `fname_in`, runs them through the
    callback pipeline defined in this module and hands the result to
    `NetCDFDecoder`. An unsupported `output_format` prints a message and
    returns without reading the file.
    """
    valid_output_formats=['openrefine_csv', 'csv']
    if output_format not in valid_output_formats:
        print (f'Invalid output format. Allowed formats: {valid_output_formats}')
        return 

    dfs = ExtractNetcdfContents(fname_in).dfs

    # NOTE(review): `ValidateNetCDFVarsCB` used to be listed after `ValidateEnumsCB`,
    # but it is neither defined nor imported in this module; add it back once available.
    # NOTE(review): `DataFormatConversionCB` does not act on 'csv' (it only knows
    # 'decoded_csv', 'encoded_csv' and 'openrefine_csv') - confirm the intended
    # behaviour for that format.
    tfm = Transformer(
        dfs,
        cbs=[
            RemoveNonORVarsCB(),
            ValidateEnumsCB(
                src_fname=fname_in,
                enums=Enums(lut_src_dir=lut_path())
                ),
            AddTaxonInformationCB(
                fn_lut=lut_taxon
                ),  
            # Must run before the format conversion: it expects encoded values.
            RemapToORSpecificMappingsCB(
                or_mappings=or_mappings
                ),            
            DataFormatConversionCB(
                output_format=output_format
                ),
            DecodeTimeCB(),
            AddSampleTypeIdColumnCB(),
            AddZoteroArchiveLocationCB(src_fname=fname_in, cfg=cfg())
        ]
    )    
    
    tfm()
    decoder = NetCDFDecoder( 
                            dfs=tfm.dfs,
                            fname_in=fname_in,  
                            dest_out=dest_out,                           
                            output_format='csv',
                            # Honour the caller's mapping (defaults to OR_VARS).
                            remap_vars=remap_vars,
                            verbose=verbose
                    )
    decoder.decode()

In [None]:
#|eval: false
# End-to-end run; `dest_out` is the input path with its suffix stripped.
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')
decode(fname_in=fname, dest_out=fname.with_suffix(''))

In [None]:
#| eval: false
# `fname` is only defined in a cell flagged `eval: false`, so this cell must be
# flagged too; otherwise it raises NameError when the notebook is run as a test.
decode(fname_in=fname, dest_out=fname.with_suffix(''))