In [None]:
#| default_exp netcdf_to_refine

# Convert MARIS NetCDF to OpenRefine CSV

> A data pipeline that converts MARIS NetCDF files into MARIS Standard OpenRefine CSV format.

This module converts NetCDF files into CSV files that follow the MARIS Standard [OpenRefine](https://openrefine.org/) format. While MARISCO has replaced OpenRefine in the data cleaning and preparation pipeline, the MARIS master database still requires input files to conform to this CSV format specification. The conversion is performed using the marisco library.


:::{.callout-tip}

For new MARIS users, please refer to the [field definitions](https://github.com/franckalbinet/marisco/blob/main/nbs/metadata/field-definition.ipynb) for detailed information about MARIS fields.

:::

# Dependencies
> Required packages and internal modules for data format transformations

In [None]:
#| export
from pathlib import Path
# from netCDF4 import Dataset
import pandas as pd
import fastcore.all as fc
from typing import Dict,Callable

from marisco.configs import (
    NC_VARS,
    CSV_VARS,
    CSV_DTYPES,
    Enums,
    lut_path,
    species_lut_path,
    detection_limit_lut_path, # used for feedback. 
    filtered_lut_path,
    cfg
)

from marisco.utils import (
    ExtractNetcdfContents,
)

from marisco.callbacks import (
    Callback,
    Transformer,
    DecodeTimeCB,
    AddSampleTypeIdColumnCB
)  
    
from marisco.decoders import (
    NetCDFDecoder
    )
from marisco.metadata import (
    ZoteroItem
)

In [None]:
#| eval: false
from IPython.display import display, Markdown

## Configuration and File Paths

In [None]:
#| eval: false
fname_in =  Path('../../_data/output/100-HELCOM-MORS-2024.nc')
fname_out = fname_in.with_suffix('.csv')
output_format = 'openrefine_csv'

## Data Loading

Load data from standardized MARIS NetCDF files using ExtractNetcdfContents. The NetCDF files follow CF conventions and include standardized variable names and metadata according to MARIS specifications.

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)

Show the dictionary of dataframes extracted from the NetCDF file. 

In [None]:
#| eval: false
contents.dfs.keys()

dict_keys(['BIOTA', 'SEAWATER', 'SEDIMENT'])

Show an example of the DataFrame extracted from the NetCDF file. 

In [None]:
#| eval: false
with pd.option_context('display.max_columns', None):
    display(contents.dfs['SEAWATER'].head())

Unnamed: 0,LON,LAT,SMP_DEPTH,TOT_DEPTH,TIME,SMP_ID,NUCLIDE,VALUE,UNIT,UNC,DL,FILT
0,29.3333,60.083302,0.0,,1337731200,0,33,5.3,1,1.696,1,0
1,29.3333,60.083302,29.0,,1337731200,1,33,19.9,1,3.98,1,0
2,23.15,59.4333,0.0,,1339891200,2,33,25.5,1,5.1,1,0
3,27.983299,60.25,0.0,,1337817600,3,33,17.0,1,4.93,1,0
4,27.983299,60.25,39.0,,1337817600,4,33,22.200001,1,3.996,1,0


Show an example of the dictionary of enums extracted from the NetCDF file as a DataFrame. 

In [None]:
#| eval: false
grp='SEAWATER'
print(f'Variables in {grp} group: {contents.enum_dicts[grp].keys()}')
var='nuclide'
with pd.option_context('display.max_columns', None):
    display(pd.DataFrame.from_dict(contents.enum_dicts[grp][var], orient='index').T)

Variables in SEAWATER group: dict_keys(['nuclide', 'unit', 'dl', 'filt'])


Unnamed: 0,NOT APPLICABLE,NOT AVAILABLE,h3,be7,c14,k40,cr51,mn54,co57,co58,co60,zn65,sr89,sr90,zr95,nb95,tc99,ru103,ru106,rh106,ag106m,ag108,ag108m,ag110m,sb124,sb125,te129m,i129,i131,cs127,cs134,cs137,ba140,la140,ce141,ce144,pm147,eu154,eu155,pb210,pb212,pb214,bi207,bi211,bi214,po210,rn220,rn222,ra223,ra224,ra225,ra226,ra228,ac228,th227,th228,th232,th234,pa234,u234,u235,u238,np237,np239,pu238,pu239,pu240,pu241,am240,am241,cm242,cm243,cm244,cs134_137_tot,pu239_240_tot,pu239_240_iii_iv_tot,pu239_240_v_vi_tot,cm243_244_tot,pu238_pu239_240_tot_ratio,am241_pu239_240_tot_ratio,cs137_134_ratio,cd109,eu152,fe59,gd153,ir192,pu238_240_tot,rb86,sc46,sn113,sn117m,tl208,mo99,tc99m,ru105,te129,te132,i132,i135,cs136,tbeta,talpha,i133,th230,pa231,u236,ag111,in116m,te123m,sb127,ba133,ce139,tl201,hg203,na22,pa234m,am243,se75,sr85,y88,ce140,bi212,u236_238_ratio,i125,ba137m,u232,pa233,ru106_rh106_tot,tu,tbeta40k,fe55,ce144_pr144_tot,pu240_pu239_ratio,u233,pu239_242_tot,ac227
0,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,28,29,30,31,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,116,117,122,123,124,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144


Show the global attributes extracted from the NetCDF file. 

In [None]:
#| eval: false
print("First few attributes from global attributes:", list(contents.global_attrs.items())[:5])

First few attributes from global attributes: [('id', '26VMZZ2Q'), ('title', 'Environmental database - Helsinki Commission Monitoring of Radioactive Substances'), ('summary', 'MORS Environment database has been used to collate data resulting from monitoring of environmental radioactivity in the Baltic Sea based on HELCOM Recommendation 26/3.\n\nThe database is structured according to HELCOM Guidelines on Monitoring of Radioactive Substances (https://www.helcom.fi/wp-content/uploads/2019/08/Guidelines-for-Monitoring-of-Radioactive-Substances.pdf), which specifies reporting format, database structure, data types and obligatory parameters used for reporting data under Recommendation 26/3.\n\nThe database is updated and quality assured annually by HELCOM MORS EG.'), ('keywords', 'oceanography, Earth Science > Oceans > Ocean Chemistry> Radionuclides, Earth Science > Human Dimensions > Environmental Impacts > Nuclear Radiation Exposure, Earth Science > Oceans > Ocean Chemistry > Ocean Tracers

Show the custom maps extracted from the NetCDF file. 

In [None]:
#| eval: false
grp='SEAWATER'
print(f'Custom maps in {grp} group: {contents.custom_maps[grp].keys()}')
with pd.option_context('display.max_columns', None):
    display(pd.DataFrame.from_dict(contents.custom_maps[grp], orient='index'))   

Custom maps in SEAWATER group: dict_keys([])


## Validate NetCDF Enumerations

Verify that enumerated values in the NetCDF file match current MARIS lookup tables.

:::{.callout-important}
## FEEDBACK TO DATA PROVIDERS

The enumeration validation process is a diagnostic step that identifies inconsistencies between NetCDF enumerations and MARIS lookup tables. While this validation does not modify the dataset, it generates detailed feedback about any mismatches or undefined values. 


:::

In [None]:
#| export
class ValidateEnumsCB(Callback):
    "Report enum entries of the NetCDF file that disagree with the MARIS lookup tables."

    def __init__(self, contents, maris_enums, verbose=False):
        fc.store_attr()

    def __call__(self, tfm):
        # Diagnostic only: the transformer's dataframes are never modified here.
        for grp, grp_enums in self.contents.enum_dicts.items():
            self._validate_group(grp, grp_enums)

    def _validate_group(self, group_name, enum_dict):
        "Check every enum variable of one NetCDF group."
        for nc_name, nc_enum_dict in enum_dict.items():
            if self.verbose:
                print(f"Validating variable {nc_name} from NetCDF group {group_name}.")
            var_name = self._get_original_var_name(nc_name)
            if self.verbose:
                print(f"Standardized variable name to MARISCO naming convention: {var_name}")

            if var_name in self.maris_enums.types:
                self._compare_mappings(nc_enum_dict, self.maris_enums.types[var_name], group_name, var_name)
            elif self.verbose:
                print(f"Variable {var_name} not found in MARISCO enums.")

    def _get_original_var_name(self, var_name):
        "Return the first `NC_VARS` key whose value equals `var_name`, else `var_name` itself."
        for maris_name, nc_name in NC_VARS.items():
            if nc_name == var_name:
                return maris_name
        return var_name

    def _compare_mappings(self, nc_dict, maris_enum, group_name, var_name):
        "Print a warning for each NetCDF enum entry that is missing from, or differs in, `maris_enum`."
        for label, raw_code in nc_dict.items():
            code = int(raw_code)
            if label in maris_enum and maris_enum[label] == code:
                continue
            # Mismatches are always reported, regardless of `verbose`.
            print(f"\nWarning: Enum mismatch: {var_name} in {group_name}.")
            print(f"   NetCDF value: {label} -> {code}")
            print(f"   MARISCO standard enum lookup value: {label} -> {maris_enum.get(label, 'Not found')}")
            

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data= contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        ValidateEnumsCB(
            contents = contents,
            maris_enums=Enums(lut_src_dir=lut_path())
        ),
    ]
)
tfm()
print('\n')





## Remove Non Compatible Columns 

The `RemoveNonCompatibleVariablesCB` callback drops every column of the NetCDF-derived dataframes that is not a key of the `vars` dictionary passed to it (`CSV_VARS` by default).

In [None]:
#| export
class RemoveNonCompatibleVariablesCB(Callback):
    "Remove variables not listed in VARS configuration."
    def __init__(self, 
                vars: Dict[str, str] = CSV_VARS,  # Dictionary whose keys are the column names to keep
                verbose: bool = False,  # Print the columns removed from each group
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        """Remove non-OR variables from all dataframes."""
        for group_name in tfm.dfs:
            tfm.dfs[group_name] = self._remove_non_vars(tfm.dfs[group_name], group_name)
            
    def _remove_non_vars(self, df: pd.DataFrame, group_name:str ) -> pd.DataFrame:
        """Return `df` without the columns that are not keys of `self.vars`.

        The columns to drop are collected in dataframe order (rather than in a
        `set`), so the verbose message is identical from one run to the next.
        """
        # `dict.fromkeys` de-duplicates repeated column labels while keeping their order.
        cols_to_remove = [col for col in dict.fromkeys(df.columns) if col not in self.vars]
        
        if self.verbose and cols_to_remove:
            # `map(str, ...)` keeps the message from failing on non-string column labels.
            print(f"Removing variables that are not compatible with vars provided. \nRemoving {', '.join(map(str, cols_to_remove))} from {group_name} dataset.")
                        
        return df.drop(columns=cols_to_remove)


In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        RemoveNonCompatibleVariablesCB(vars=CSV_VARS, verbose=True),
    ]
)
tfm()
print('\n')

Removing variables that are not compatible with vars provided. 
Removing BIO_GROUP from BIOTA dataset.




## Add Taxon Information

In [None]:
#| export
# Species lookup table column name -> standardized (upper-case) column name.
TAXON_MAP = dict(
    Taxonname='TAXONNAME',
    Taxonrank='TAXONRANK',
    TaxonDB='TAXONDB',
    TaxonDBID='TAXONDBID',
    TaxonDBURL='TAXONDBURL',
)

In [None]:
#| export
def get_taxon_info_lut(maris_lut: str, key_names: dict = TAXON_MAP) -> dict:
    "Build `{renamed column: {species_id: value}}` from the MARIS species lookup table."
    # Keep only the id column plus the taxon columns named in `key_names`
    wanted = ['species_id', *key_names]
    lut_df = pd.read_excel(maris_lut)[wanted]
    lut_df = lut_df.rename(columns=key_names).set_index('species_id')
    # Default `to_dict` orientation: one inner dict per column, keyed by the index
    return lut_df.to_dict()

# Zero-argument factory: the lookup table is only read when this is called.
lut_taxon = lambda: get_taxon_info_lut(maris_lut=species_lut_path(), key_names=TAXON_MAP)

In [None]:
#| export
class AddTaxonInformationCB(Callback):
    """Add taxon columns to the BIOTA dataframe, looked up from its SPECIES column."""
    
    def __init__(self, 
                fn_lut: Callable = lut_taxon,  # Zero-argument function returning {column: {species id: value}}
                verbose: bool = False
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        """Add the taxon columns when both the BIOTA group and its SPECIES column exist."""
        if self.check_biota_group_exists(tfm):
            biota = tfm.dfs['BIOTA']
            if self.check_species_column_exists(biota):
                self.add_taxon_columns(biota)

    def check_biota_group_exists(self, tfm: Transformer) -> bool:
        """Return whether `tfm.dfs` holds a 'BIOTA' group (reports its absence if verbose)."""
        found = 'BIOTA' in tfm.dfs
        if not found and self.verbose:
            print("No BIOTA group found, skipping taxon information")
        return found

    def check_species_column_exists(self, df: pd.DataFrame) -> bool:
        """Return whether `df` has a 'SPECIES' column (reports its absence if verbose)."""
        found = 'SPECIES' in df.columns
        if not found and self.verbose:
            print("No SPECIES column found in BIOTA dataframe, skipping taxon information")
        return found

    def add_taxon_columns(self, df: pd.DataFrame):
        """Write one column per lookup entry into `df` (in place), then report unmatched ids."""
        for taxon_col, values_by_species in self.fn_lut().items():
            # Species ids missing from the lookup are labelled 'Unknown'
            df[taxon_col] = df['SPECIES'].map(values_by_species).fillna('Unknown')
        
        self.report_unmatched_species(df)

    def report_unmatched_species(self, df: pd.DataFrame):
        """Print the species ids whose TAXONNAME is 'Unknown' (only if verbose)."""
        is_unknown = df['TAXONNAME'] == 'Unknown'
        missing_ids = df.loc[is_unknown, 'SPECIES'].unique()
        if self.verbose and len(missing_ids) > 0:
            print(f"Warning: Species IDs not found in lookup table: {', '.join(map(str, missing_ids))}")

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        AddTaxonInformationCB(
            fn_lut=lut_taxon
        ),
    ]
)

tfm()
print(tfm.dfs['BIOTA'][['TAXONNAME','TAXONRANK','TAXONDB','TAXONDBID','TAXONDBURL']])


               TAXONNAME TAXONRANK   TAXONDB TAXONDBID  \
0           Gadus morhua   species  Wikidata   Q199788   
1           Gadus morhua   species  Wikidata   Q199788   
2           Gadus morhua   species  Wikidata   Q199788   
3           Gadus morhua   species  Wikidata   Q199788   
4           Gadus morhua   species  Wikidata   Q199788   
...                  ...       ...       ...       ...   
16089  Fucus vesiculosus   species  Wikidata   Q754755   
16090  Fucus vesiculosus   species  Wikidata   Q754755   
16091     Mytilus edulis   species  Wikidata    Q27855   
16092     Mytilus edulis   species  Wikidata    Q27855   
16093     Mytilus edulis   species  Wikidata    Q27855   

                                  TAXONDBURL  
0      https://www.wikidata.org/wiki/Q199788  
1      https://www.wikidata.org/wiki/Q199788  
2      https://www.wikidata.org/wiki/Q199788  
3      https://www.wikidata.org/wiki/Q199788  
4      https://www.wikidata.org/wiki/Q199788  
...                  

## Standardize Time

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        DecodeTimeCB(),
    ]
)

tfm()

print(tfm.dfs['BIOTA']['TIME'])


0       2012-09-23
1       2012-09-23
2       2012-09-23
3       2012-09-23
4       2012-09-23
           ...    
16089   2022-05-10
16090   2022-05-10
16091   2022-09-15
16092   2022-09-15
16093   2022-09-15
Name: TIME, Length: 16094, dtype: datetime64[ns]


## Add Sample Type ID

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        AddSampleTypeIdColumnCB(),
    ]
)

tfm()
print(tfm.dfs['SEAWATER']['SAMPLE_TYPE'].unique())
print(tfm.dfs['BIOTA']['SAMPLE_TYPE'].unique())
print(tfm.dfs['SEDIMENT']['SAMPLE_TYPE'].unique())


[1]
[2]
[3]


## Add Reference ID


Include the `ref_id` (i.e., Zotero Archive Location). The `AddZoteroArchiveLocationCB` callback looks up the Zotero Archive Location using the Zotero key stored as `id` in the global attributes of the MARIS NetCDF file.

In [None]:
#| eval: false
contents.global_attrs['id']

'26VMZZ2Q'

In [None]:
#| export
class AddZoteroArchiveLocationCB(Callback):
    "Fetch and append 'Loc. in Archive' from Zotero to DataFrame."
    def __init__(self, 
                 attrs: dict,  # Global attributes of the NetCDF file; the Zotero key is read from `attrs['id']`
                 cfg: dict  # Configuration; `cfg['zotero']` is passed to `ZoteroItem`
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        "Set `REF_ID` in every dataframe to the Zotero archive location, or warn if the item is missing."
        zotero_key = self.attrs['id']
        item = ZoteroItem(zotero_key, self.cfg['zotero'])
        if not item.exist():
            # The key is reported from the local variable: `self.item_id` is never
            # set on this callback, so reading it raised AttributeError instead of warning.
            print(f"Warning: Zotero item {zotero_key} does not exist.")
            return

        # Convert once; the same reference id applies to every group.
        ref_id = int(item.item['data']['archiveLocation'])
        for df in tfm.dfs.values():
            df['REF_ID'] = ref_id

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        AddZoteroArchiveLocationCB(contents.global_attrs, cfg=cfg()),
    ]
)
tfm()
print(tfm.dfs['SEAWATER']['REF_ID'].unique())


[100]


## Remap encoded custom maps

NetCDF variables can store custom mappings as attributes, which provide a way to map between encoded values and their human-readable representations. The `RemapCustomMapsCB` callback handles this conversion process.

In [None]:
#| export
class RemapCustomMapsCB(Callback):
    "Replace encoded values with their labels for every column that has a custom map."
    def __init__(self, verbose: bool = False):
        fc.store_attr()
        
    def __call__(self, tfm):
        """Decode, in place, each column listed in `tfm.custom_maps` for its group."""
        for grp, df in tfm.dfs.items():
            for var in df.columns:
                if var not in tfm.custom_maps[grp]:
                    continue
                if self.verbose:
                    print(f"Remapping {var} from {grp} group")

                # Cast to int first so the values match the integer keys built below
                df[var] = df[var].astype(int)

                # The custom map is stored as label -> code; flip it to code -> label
                code_to_label = {int(code): label for label, code in tfm.custom_maps[grp][var].items()}

                df[var] = df[var].map(code_to_label)

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        RemapCustomMapsCB(verbose=True),
    ]
)
tfm()
print('Example of remapped custom maps:')
for grp in tfm.dfs:
    print(grp)
    print(tfm.dfs[grp]['SMP_ID'].head())

Example of remapped custom maps:
BIOTA
0    0
1    0
2    0
3    0
4    1
Name: SMP_ID, dtype: uint64
SEAWATER
0    0
1    1
2    2
3    3
4    4
Name: SMP_ID, dtype: uint64
SEDIMENT
0    0
1    1
2    2
3    3
4    4
Name: SMP_ID, dtype: uint64


## Remap to Open Refine specific mappings

:::{.callout-warning}
## FEEDBACK FOR NEXT VERSION 

[To be further clarified]

The current approach of remapping to OR-specific mappings should be reconsidered. Considering that we already utilize MARISCO lookup tables in NetCDF for creating enums, it would be beneficial to extend their use to OpenRefine data formats as well. By doing so, we could eliminate the need for OpenRefine-specific mappings, streamlining the data transformation process. Let's review the lookup tables used to create the enums for NetCDF:

:::

In [None]:
#| eval: false
enums = Enums(lut_src_dir=lut_path())
print(f'DL enums: {enums.types["DL"]}')
print(f'FILT enums: {enums.types["FILT"]}')

DL enums: {'Not applicable': -1, 'Not available': 0, 'Detected value': 1, 'Detection limit': 2, 'Not detected': 3, 'Derived': 4}
FILT enums: {'Not applicable': -1, 'Not available': 0, 'Yes': 1, 'No': 2}


For the detection limit lookup table (LUT), as shown below, the values required for the OpenRefine CSV format are listed under the 'name' column, whereas the enums utilize the 'name_sanitized' column. Additionally, for the filtered LUT, also shown below, the values do not align consistently with the OpenRefine CSV format, which uses (`Y`, `N`, `NA`).

In [None]:
#| eval: false
dl_lut = pd.read_excel(detection_limit_lut_path())
dl_lut

Unnamed: 0,id,name,name_sanitized
0,-1,Not applicable,Not applicable
1,0,Not Available,Not available
2,1,=,Detected value
3,2,<,Detection limit
4,3,ND,Not detected
5,4,DE,Derived


In [None]:
#| eval: false
filtered_lut = pd.read_excel(filtered_lut_path())
filtered_lut

Unnamed: 0,id,name
0,-1,Not applicable
1,0,Not available
2,1,Yes
3,2,No


We will create OpenRefine specific mappings for the detection limit and filtered data:

In [None]:
#| export
# OpenRefine-specific labels for the encoded `DL` and `FILT` columns (code -> label).
# NOTE(review): codes missing from a mapping become NaN when applied with `Series.map`.
# The detection limit lookup table lists ids -1, 3 ('ND') and 4 ('DE') that have no
# entry here, and gives id 0 as 'Not Available' - confirm that 0 -> 'ND' is intended.
or_mappings = {
    'DL': {
        0: 'ND',
        1: '=',
        2: '<',
    },
    'FILT': {
        0: 'NA',
        1: 'Y',
        2: 'N',
    },
}

The `RemapToORSpecificMappingsCB` callback remaps the values of the detection limit and filtered columns to the OpenRefine CSV format.

In [None]:
#| export
class RemapToORSpecificMappingsCB(Callback):
    "Convert values using OR mappings if columns exist in dataframe."
    def __init__(self, 
                or_mappings: Dict[str, Dict] = or_mappings,  # Dictionary of column mappings, {column: {code: label}}
                output_format: str = 'openrefine_csv',  # Mappings are only applied for 'openrefine_csv'
                verbose: bool = False
                ):
        fc.store_attr()
        
    def __call__(self, tfm: Transformer):
        """Apply OR mappings to all dataframes when the output format is 'openrefine_csv'."""
        # The OpenRefine labels only belong in the OpenRefine CSV. For any other
        # format the columns must stay encoded so that they can still be decoded
        # through the MARIS enums later (same rule as `get_excluded_enums`).
        if self.output_format != 'openrefine_csv':
            if self.verbose:
                print(f"\nSkipping OR mappings for output format: {self.output_format}")
            return

        for group_name in tfm.dfs:
            if self.verbose:
                print(f"\nProcessing {group_name} group...")
            tfm.dfs[group_name] = self._apply_mappings(tfm.dfs[group_name])
            
    def _apply_mappings(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply OR mappings to columns that exist in the dataframe (modified in place and returned)."""
        for col, mapping in self.or_mappings.items():
            if col in df.columns:
                if self.verbose:
                    print(f"    Mapping values for column: {col}")
                # Values without an entry in `mapping` become NaN
                df[col] = df[col].map(mapping)
        return df


In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    data= contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        RemapToORSpecificMappingsCB(),
    ]
)

tfm()

# Loop through each group in the 'dfs' dictionary
for group_name, df in tfm.dfs.items():
    # Check if the group dataframe contains any of the columns specified in or_mappings.keys()
    relevant_columns = [col for col in or_mappings.keys() if col in df.columns]
    if relevant_columns:
        # Print the unique values from the relevant columns
        print(f"\nUnique values in {group_name} for columns {relevant_columns}:")
        for col in relevant_columns:
            print(f"{col}: {df[col].unique()}")
    else:
        print(f"No relevant columns found in {group_name} based on or_mappings keys.")


Unique values in BIOTA for columns ['DL']:
DL: ['<' '=' 'ND']

Unique values in SEAWATER for columns ['DL', 'FILT']:
DL: ['=' '<' 'ND']
FILT: ['NA' 'N' 'Y']

Unique values in SEDIMENT for columns ['DL']:
DL: ['=' '<' 'ND']


## Remap to CSV data type format

`CSV_DTYPES` (defined in `configs.ipynb`) defines a state for each variable that contains a lookup table (i.e. enums). The state is either 'decoded' or 'encoded'. Let's review the variable states as a DataFrame:

In [None]:
#| eval: false
with pd.option_context('display.max_columns', None, 'display.max_colwidth', None):
    display(pd.DataFrame.from_dict(CSV_DTYPES, orient='index').T)

Unnamed: 0,AREA,NUCLIDE,UNIT,DL,FILT,COUNT_MET,SAMP_MET,PREP_MET,SPECIES,BODY_PART,SED_TYPE,LAB
state,decoded,encoded,encoded,decoded,decoded,encoded,encoded,encoded,encoded,encoded,encoded,encoded


In [None]:
#| eval: false
enums = Enums(lut_src_dir=lut_path())
enums.types.keys()

dict_keys(['AREA', 'BIO_GROUP', 'BODY_PART', 'COUNT_MET', 'DL', 'FILT', 'NUCLIDE', 'PREP_MET', 'SAMP_MET', 'SED_TYPE', 'SPECIES', 'UNIT', 'LAB'])

In [None]:
#| export
def get_excluded_enums(output_format: str = 'openrefine_csv') -> dict:
    "Return `or_mappings` for the 'openrefine_csv' output format, an empty dict otherwise."
    if output_format != 'openrefine_csv':
        return {}
    return or_mappings

In [None]:
#| export
class DataFormatConversionCB(Callback):
    """
    A callback to convert DataFrame enum values from encoded ids to decoded names based on specified settings.
    """

    def __init__(self, 
                 dtypes: Dict,  # Dictionary defining data types and states for each lookup table
                 excluded_mappings: Callable = get_excluded_enums,  # Function returning, for an output format, the columns to exclude from conversion
                 output_format: str = 'openrefine_csv',
                 verbose: bool = False  # Flag for verbose output
                ):
        fc.store_attr()

    def __call__(self, tfm):
        """
        Apply the data format conversion to each DataFrame within the Transformer.
        """
        self.load_enums()
        
        for group_name, df in tfm.dfs.items():
            tfm.dfs[group_name] = self.process_dataframe(group_name, df)

    def load_enums(self):
        """
        Load enums from the lookup path.
        """
        self.enums = Enums(lut_path())
        if self.verbose:
            print(f"Loaded enums: {self.enums.types.keys()}")

    def process_dataframe(self, group_name: str, df: pd.DataFrame):
        """
        Decode, in place, every column of `df` whose target state is 'decoded'.

        A column is decoded when it is listed in `self.dtypes` with state
        'decoded', is not excluded for the current output format, and has an
        enum in the loaded lookup tables. `group_name` is not used.
        """
        # The exclusion list depends only on the output format: compute it once.
        excluded = self.excluded_mappings(self.output_format)

        for column in df.columns:
            if column not in self.dtypes or column in excluded:
                continue
            if self.dtypes[column]['state'] != 'decoded':
                continue

            if self.verbose:
                print(f"Decoding column: {column}")

            if column not in self.enums.types:
                if self.verbose:
                    print(f"No enum mapping found for column: {column}, skipping decoding.")
                continue

            # The enums are stored as name -> id, while the column holds ids.
            # Invert the enum so ids map to names; mapping through the enum
            # as stored finds no key and turns every value into NaN.
            id_to_name = {code: name for name, code in self.enums.types[column].items()}
            df[column] = df[column].map(id_to_name)
            if self.verbose:
                print(f"Decoded column: {column}")
        return df

In [None]:
#| eval: false
# Run the column filter followed by the data format conversion on a fresh extraction.
contents = ExtractNetcdfContents(fname_in)
tfm = Transformer(
    contents.dfs,
    cbs=[
        RemoveNonCompatibleVariablesCB(vars=CSV_VARS, verbose=True),
        DataFormatConversionCB(
            dtypes=CSV_DTYPES,
            excluded_mappings = get_excluded_enums,
            output_format='openrefine_csv',
            verbose=True
        ),
    ]
)
# NOTE(review): as the last expression of the cell, the return value of `tfm()` is
# echoed in full (a dict of whole DataFrames in the recorded output). Consider
# `tfm();` followed by a `.head()` display of one group to keep the output short.
tfm()

Removing variables that are not compatible with vars provided. 
Removing BIO_GROUP from BIOTA dataset.
Loaded enums: dict_keys(['AREA', 'BIO_GROUP', 'BODY_PART', 'COUNT_MET', 'DL', 'FILT', 'NUCLIDE', 'PREP_MET', 'SAMP_MET', 'SED_TYPE', 'SPECIES', 'UNIT', 'LAB'])


{'BIOTA':              LON        LAT  SMP_DEPTH        TIME  SMP_ID  NUCLIDE  \
 0      12.316667  54.283333        NaN  1348358400       0       31   
 1      12.316667  54.283333        NaN  1348358400       0        4   
 2      12.316667  54.283333        NaN  1348358400       0        9   
 3      12.316667  54.283333        NaN  1348358400       0       33   
 4      12.316667  54.283333        NaN  1348358400       1       31   
 ...          ...        ...        ...         ...     ...      ...   
 16089  21.395000  61.241501        2.0  1652140800    4789       33   
 16090  21.395000  61.241501        2.0  1652140800    4789        9   
 16091  21.385000  61.343334        NaN  1663200000    4790        4   
 16092  21.385000  61.343334        NaN  1663200000    4790       33   
 16093  21.385000  61.343334        NaN  1663200000    4790       12   
 
             VALUE  UNIT       UNC  DL  SPECIES  BODY_PART       DRYWT  WETWT  \
 0        0.010140     5       NaN   2      

## Review all callbacks

In [None]:
#| eval: false
contents = ExtractNetcdfContents(fname_in)
output_format = 'openrefine_csv'
tfm = Transformer(
    data=contents.dfs,
    custom_maps=contents.custom_maps,
    cbs=[
        ValidateEnumsCB(
            contents = contents,
            maris_enums=Enums(lut_src_dir=lut_path())
        ),
        RemoveNonCompatibleVariablesCB(vars=CSV_VARS) ,
        RemapCustomMapsCB(),
        RemapToORSpecificMappingsCB(output_format=output_format),
        AddTaxonInformationCB(
            fn_lut=lut_taxon
        ),
        DecodeTimeCB(),
        AddSampleTypeIdColumnCB(),
        AddZoteroArchiveLocationCB(contents.global_attrs, cfg=cfg()),
        DataFormatConversionCB(
            dtypes=CSV_DTYPES,
            excluded_mappings = get_excluded_enums,
            output_format=output_format,
        ) 
        ]
)
tfm()
for grp in ['SEAWATER', 'BIOTA']:
    display(Markdown(f"<b>Head of the transformed `{grp}` DataFrame:</b>"))
    with pd.option_context('display.max_rows', None):
        display(tfm.dfs[grp].head())

<b>Head of the transformed `SEAWATER` DataFrame:</b>

Unnamed: 0,LON,LAT,SMP_DEPTH,TOT_DEPTH,TIME,SMP_ID,NUCLIDE,VALUE,UNIT,UNC,DL,FILT,SAMPLE_TYPE,REF_ID
0,29.3333,60.083302,0.0,,2012-05-23,0,33,5.3,1,1.696,=,,1,100
1,29.3333,60.083302,29.0,,2012-05-23,1,33,19.9,1,3.98,=,,1,100
2,23.15,59.4333,0.0,,2012-06-17,2,33,25.5,1,5.1,=,,1,100
3,27.983299,60.25,0.0,,2012-05-24,3,33,17.0,1,4.93,=,,1,100
4,27.983299,60.25,39.0,,2012-05-24,4,33,22.200001,1,3.996,=,,1,100


<b>Head of the transformed `BIOTA` DataFrame:</b>

Unnamed: 0,LON,LAT,SMP_DEPTH,TIME,SMP_ID,NUCLIDE,VALUE,UNIT,UNC,DL,...,DRYWT,WETWT,PERCENTWT,TAXONNAME,TAXONRANK,TAXONDB,TAXONDBID,TAXONDBURL,SAMPLE_TYPE,REF_ID
0,12.316667,54.283333,,2012-09-23,0,31,0.01014,5,,<,...,174.934433,948.0,0.18453,Gadus morhua,species,Wikidata,Q199788,https://www.wikidata.org/wiki/Q199788,2,100
1,12.316667,54.283333,,2012-09-23,0,4,135.300003,5,4.83021,=,...,174.934433,948.0,0.18453,Gadus morhua,species,Wikidata,Q199788,https://www.wikidata.org/wiki/Q199788,2,100
2,12.316667,54.283333,,2012-09-23,0,9,0.01398,5,,<,...,174.934433,948.0,0.18453,Gadus morhua,species,Wikidata,Q199788,https://www.wikidata.org/wiki/Q199788,2,100
3,12.316667,54.283333,,2012-09-23,0,33,4.338,5,0.150962,=,...,174.934433,948.0,0.18453,Gadus morhua,species,Wikidata,Q199788,https://www.wikidata.org/wiki/Q199788,2,100
4,12.316667,54.283333,,2012-09-23,1,31,0.009614,5,,<,...,177.93512,964.0,0.18458,Gadus morhua,species,Wikidata,Q199788,https://www.wikidata.org/wiki/Q199788,2,100


## Decode   

In [None]:
#| export
def decode(
    fname_in: str, # Input file name
    dest_out: str | None = None, # Output file name (optional)
    output_format: str = 'openrefine_csv', # Either 'openrefine_csv' or 'decoded_csv'
    remap_vars: Dict[str, str] = CSV_VARS, # Variables to keep, also passed to the decoder
    remap_dtypes: Dict[str, str] = CSV_DTYPES, # Target state ('encoded'/'decoded') of each enum variable
    verbose: bool = False, # Passed to the decoder
    **kwargs # Additional arguments (currently unused)
    ) -> None:
    "Decode data from NetCDF: run the conversion callbacks, then hand the dataframes to `NetCDFDecoder`."
    valid_output_formats = ['openrefine_csv', 'decoded_csv']

    if output_format not in valid_output_formats:
        # An unsupported format is reported and ignored rather than raised.
        print (f'Invalid output format. Allowed formats: {valid_output_formats}')
        return

    if output_format == 'decoded_csv':
        # In fully decoded output every enum variable is decoded.
        remap_dtypes = {k: {'state': 'decoded'} for k in remap_dtypes.keys()}

    contents = ExtractNetcdfContents(fname_in)
    tfm = Transformer(
        data=contents.dfs,
        custom_maps=contents.custom_maps,
        cbs=[
            ValidateEnumsCB(
                contents=contents,
                maris_enums=Enums(lut_src_dir=lut_path())
            ),
            RemoveNonCompatibleVariablesCB(vars=remap_vars),
            RemapCustomMapsCB(),
            RemapToORSpecificMappingsCB(output_format=output_format),
            AddTaxonInformationCB(
                fn_lut=lut_taxon
            ),
            DecodeTimeCB(),
            AddSampleTypeIdColumnCB(),
            AddZoteroArchiveLocationCB(contents.global_attrs, cfg=cfg()),
            DataFormatConversionCB(
                dtypes=remap_dtypes,
                excluded_mappings=get_excluded_enums,
                output_format=output_format
            ),
        ]
    )

    tfm()
    decoder = NetCDFDecoder(
        dfs=tfm.dfs,
        fname_in=fname_in,
        dest_out=dest_out,
        output_format='csv',
        # Use the caller's mapping; this was hard-coded to CSV_VARS, which
        # silently ignored a custom `remap_vars`.
        remap_vars=remap_vars,
        verbose=verbose
    )
    decoder.decode()

In [None]:
#|eval: false
fname = Path('../../_data/output/100-HELCOM-MORS-2024.nc')
decode(fname_in=fname, dest_out=fname.with_suffix(''), output_format='openrefine_csv')