In [4]:
#| default_exp netcdf_to_csv

# NetCDF to Open Refine CSV (WIP)

***

## Packages import

mamba install dask --force-reinstall


In [5]:

from pathlib import Path # This module offers classes representing filesystem paths
import xarray as xr
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import numpy as np
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from cftime import num2pydate 
from marisco.configs import cfg, lut_path
from marisco.serializers import OpenRefineCsvEncoder

Get the current working directory (cwd). . 

In [6]:
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

In [11]:
fname_in = '../../_data/output/ospar_19950103_2021214.nc'
fname_out = '../../_data/output/ospar_test.csv'

### Load NetCDF 

load netcdf4 data

In [12]:
def netcdf4_to_df(fname_in):
    # read nc file
    netcdf4_data = Dataset(fname_in, "r")
    # Create dictionary of dataframes
    dfs={}
    for group in (netcdf4_data.groups.keys()):
        ds = xr.open_dataset(fname_in, group=group,  decode_times=False)
        dfs[group]=ds.to_dataframe()
    netcdf4_data.close()
    return(dfs)

In [14]:
dfs = netcdf4_to_df(fname_in)
dfs['biota']

Unnamed: 0_level_0,lon,lat,time,bio_group,body_part
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4857,-3.501945e+00,4.345778e+01,1369353600,6,19
4859,-3.501945e+00,4.345778e+01,1369353600,6,19
4861,-3.501945e+00,4.345778e+01,1369353600,6,19
4863,-3.501945e+00,4.345778e+01,1369353600,6,19
4865,-3.501945e+00,4.345778e+01,1369353600,6,19
...,...,...,...,...,...
18446744073709551614,9.969210e+36,9.969210e+36,18446744073709551614,18446744073709551614,18446744073709551614
18446744073709551614,9.969210e+36,9.969210e+36,18446744073709551614,18446744073709551614,18446744073709551614
18446744073709551614,9.969210e+36,9.969210e+36,18446744073709551614,18446744073709551614,18446744073709551614
18446744073709551614,9.969210e+36,9.969210e+36,18446744073709551614,18446744073709551614,18446744073709551614


In [10]:
dfs['seawater'].columns

Index(['lon', 'lat', 'depth', 'time'], dtype='object')

### Transpose Nuclides 

In [None]:
#| export
# Define cols that are not nuclides
sample_cols_grp = {'seawater': ['sample','lon', 'lat', 'depth', 'time'],
           'sediment': ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type'],
           'biota': ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part']}

In [None]:
#| export
class TransposeNuclideColumns(Callback):
    
    " Transpose NetCDF nuclide data."
    def __init__(self, cols_grp=sample_cols_grp): fc.store_attr()
    
    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp]=self.transpose_nuclides(tfm.dfs[grp], grp)
            
    def transpose_nuclides(self, df, group):
        sample_cols=self.cols_grp[group]
        nuclide_unc_unit_cols=list(set(df.columns) - set(sample_cols))
        unc_cols=[x for x in nuclide_unc_unit_cols if '_unc' in x]
        unit_cols=[x for x in nuclide_unc_unit_cols if '_unit' in x]
        dl_cols=[x for x in nuclide_unc_unit_cols if '_dl' in x]
        nuclide_cols= list(set(nuclide_unc_unit_cols) - set(unit_cols+unc_cols+dl_cols))
              
        # Transpose nuclide_cols
        nuclide_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=nuclide_cols, var_name='nuclide', value_name='activity')
        
        # Transpose unc_cols
        unc_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=unc_cols, var_name='nuclide', value_name='uncertainty')
        unc_df['nuclide'] = unc_df['nuclide'].str.replace('_unc', '')    
            
        # Transpose unit_cols
        unit_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=unit_cols, var_name='nuclide', value_name='unit_id')       
        unit_df['nuclide'] = unit_df['nuclide'].str.replace('_unit', '')
        
        # Transpose dl_cols
        dl_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=dl_cols, var_name='nuclide', value_name='dl')       
        dl_df['nuclide'] = dl_df['nuclide'].str.replace('_dl', '')        
        
        # Combine nuclide_df, unc_df, unit_df and dl_df
        combine_on=sample_cols+['nuclide']
        df = pd.merge(nuclide_df, unc_df,  how='outer', left_on= combine_on, right_on = combine_on)
        df = pd.merge(df, unit_df,  how='outer', left_on= combine_on, right_on = combine_on)
        df = pd.merge(df, dl_df,  how='outer', left_on= combine_on, right_on = combine_on)
        
        # Keep all rows where 'activity' is not 'nan' OR 'uncertainty' is not 'nan' OR 'dl' is not 'nan' OR'unit_id' not equal 0.
        df=df[(df['activity'].notna()) | (df['uncertainty'].notna()) | (df['dl'].notna()) | (df['unit_id'] != 0 )]
        return(df)            

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns()])
tfm()

{'seawater':         sample        lon        lat  depth        time        nuclide  \
 8            0  11.078300  54.349998    0.0   515980800          cs134   
 9            0  11.078300  54.349998    0.0   515980800          cs137   
 38           1  10.316700  54.500000    0.0   516153600          cs134   
 39           1  10.316700  54.500000    0.0   516153600          cs137   
 69           2  21.026600  55.305000    0.0  1549929600          cs137   
 ...        ...        ...        ...    ...         ...            ...   
 144248    4818  18.235001  58.583302  460.0   838252800          cs134   
 144249    4818  18.235001  58.583302  460.0   838252800          cs137   
 144257    4818  18.235001  58.583302  460.0   838252800          pu238   
 144259    4818  18.235001  58.583302  460.0   838252800  pu239_240_tot   
 144265    4818  18.235001  58.583302  460.0   838252800           sr90   
 
            activity  uncertainty  unit_id  dl  
 8       1360.000000   476.000000    

***

### Parse Time

In [None]:
#| export
class ParseTimeCB(Callback):
    def __init__(self, cfg): fc.store_attr()
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['time'] = tfm.dfs[k]['time'].apply(self.format_time)
    def format_time(self, x): 
        return num2pydate(x, units=self.cfg['units']['time'])

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg())])
tfm()

{'seawater':         sample        lon        lat  depth       time        nuclide  \
 8            0  11.078300  54.349998    0.0 1986-05-09          cs134   
 9            0  11.078300  54.349998    0.0 1986-05-09          cs137   
 38           1  10.316700  54.500000    0.0 1986-05-11          cs134   
 39           1  10.316700  54.500000    0.0 1986-05-11          cs137   
 69           2  21.026600  55.305000    0.0 2019-02-12          cs137   
 ...        ...        ...        ...    ...        ...            ...   
 144248    4818  18.235001  58.583302  460.0 1996-07-25          cs134   
 144249    4818  18.235001  58.583302  460.0 1996-07-25          cs137   
 144257    4818  18.235001  58.583302  460.0 1996-07-25          pu238   
 144259    4818  18.235001  58.583302  460.0 1996-07-25  pu239_240_tot   
 144265    4818  18.235001  58.583302  460.0 1996-07-25           sr90   
 
            activity  uncertainty  unit_id  dl  
 8       1360.000000   476.000000        1 NaN  


***

### Sample Type 

In [None]:
#| export
# Define sample types groups
sample_type_lut = {'seawater': 1,
           'sediment': 2,
           'biota': 3,
           'suspended matter': 4}

In [None]:
#| export
class LookupSampleType(Callback):
    def __init__(self, lut=sample_type_lut): fc.store_attr()
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['samptype_id'] = self.lut[k]
            

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType()])
tfm()

{'seawater':         sample        lon        lat  depth       time        nuclide  \
 8            0  11.078300  54.349998    0.0 1986-05-09          cs134   
 9            0  11.078300  54.349998    0.0 1986-05-09          cs137   
 38           1  10.316700  54.500000    0.0 1986-05-11          cs134   
 39           1  10.316700  54.500000    0.0 1986-05-11          cs137   
 69           2  21.026600  55.305000    0.0 2019-02-12          cs137   
 ...        ...        ...        ...    ...        ...            ...   
 144248    4818  18.235001  58.583302  460.0 1996-07-25          cs134   
 144249    4818  18.235001  58.583302  460.0 1996-07-25          cs137   
 144257    4818  18.235001  58.583302  460.0 1996-07-25          pu238   
 144259    4818  18.235001  58.583302  460.0 1996-07-25  pu239_240_tot   
 144265    4818  18.235001  58.583302  460.0 1996-07-25           sr90   
 
            activity  uncertainty  unit_id  dl  samptype_id  
 8       1360.000000   476.000000   

***

### Nuclide lookup

In [None]:
#| export
def get_nucnames_lut():
    fname = lut_path() / 'dbo_nuclide.xlsx'
    df_nuclide = pd.read_excel(fname, usecols=['nuclide_id', 'nc_name'])
    return df_nuclide.set_index('nc_name').to_dict()['nuclide_id']

In [None]:
# | export
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self,
                 fn_lut=get_nucnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
            tfm.dfs[k]['nuclide_id']=tfm.dfs[k]['nuclide_id'].astype('int64')

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':         sample        lon        lat  depth       time        nuclide  \
 8            0  11.078300  54.349998    0.0 1986-05-09          cs134   
 9            0  11.078300  54.349998    0.0 1986-05-09          cs137   
 38           1  10.316700  54.500000    0.0 1986-05-11          cs134   
 39           1  10.316700  54.500000    0.0 1986-05-11          cs137   
 69           2  21.026600  55.305000    0.0 2019-02-12          cs137   
 ...        ...        ...        ...    ...        ...            ...   
 144248    4818  18.235001  58.583302  460.0 1996-07-25          cs134   
 144249    4818  18.235001  58.583302  460.0 1996-07-25          cs137   
 144257    4818  18.235001  58.583302  460.0 1996-07-25          pu238   
 144259    4818  18.235001  58.583302  460.0 1996-07-25  pu239_240_tot   
 144265    4818  18.235001  58.583302  460.0 1996-07-25           sr90   
 
            activity  uncertainty  unit_id  dl  samptype_id  nuclide_id  
 8       1360.000000   4

In [None]:
tfm.dfs['biota']

Unnamed: 0,sample,lon,lat,depth,time,species_id,body_part,nuclide,activity,uncertainty,unit_id,dl,samptype_id,nuclide_id
11,0,9.410000,54.310001,2.0,2011-12-11,50,52,co60,0.00848,,3,,3,9
12,0,9.410000,54.310001,2.0,2011-12-11,50,52,cs134,0.00545,,3,,3,31
14,0,9.410000,54.310001,2.0,2011-12-11,50,52,cs137,0.77600,0.029488,3,,3,33
20,0,9.410000,54.310001,2.0,2011-12-11,50,52,k40,117.00000,4.212000,3,,3,4
63,1,10.000000,54.450001,4.0,2011-12-13,99,52,co60,0.00972,,3,,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204062,3924,27.450001,59.279999,0.0,2003-11-26,50,1,cs137,5.80000,0.290000,3,,3,33
204114,3925,27.450001,59.279999,0.0,2004-12-30,50,1,cs137,6.50000,0.260000,3,,3,33
204166,3926,27.450001,59.279999,0.0,2002-10-25,50,1,cs137,9.76000,0.585600,3,,3,33
204218,3927,27.450001,59.279999,0.0,2005-12-14,50,1,cs137,6.80000,0.136000,3,,3,33


***

### Rename columns

Rename 

TODO: What is ref_id? 
Should/is 'sample' used to create a ref_id? 

TODO: List COI in order

TODO 'nuclide' will be dropped in the actual OR

In [None]:
#| export
# Define columns of interest by sample type
coi_grp = {'seawater': ['sample', 'lon', 'lat', 'depth', 'time', 'nuclide', 'activity',
                     'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
       'sediment' : ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type', 'nuclide',
                     'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
       'biota' : ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part',
                     'nuclide', 'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id']}

In [None]:
#| export
# Define column names renaming rules
renaming_rules = {
    'lat': 'latitude',
    'lon': 'longitude',
    'time': 'begperiod',
    'depth': 'sampdepth',
    'nuclide': 'nuclide_id',
    'uncertainty': 'uncertaint',
    'dl': 'detection',
    'sed_type': 'sedtype_id (0)',
    'species_id': 'species_id (0)',
    'body_part': 'bodypar_id',
}

In [None]:
#| export
class RenameColumnCB(Callback):
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Select cols of interest
            tfm.dfs[k] = tfm.dfs[k].loc[:, self.coi[k]]

            # Rename cols
            tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            RenameColumnCB()
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':         sample  longitude   latitude  sampdepth  begperiod     nuclide_id  \
 8            0  11.078300  54.349998        0.0 1986-05-09          cs134   
 9            0  11.078300  54.349998        0.0 1986-05-09          cs137   
 38           1  10.316700  54.500000        0.0 1986-05-11          cs134   
 39           1  10.316700  54.500000        0.0 1986-05-11          cs137   
 69           2  21.026600  55.305000        0.0 2019-02-12          cs137   
 ...        ...        ...        ...        ...        ...            ...   
 144248    4818  18.235001  58.583302      460.0 1996-07-25          cs134   
 144249    4818  18.235001  58.583302      460.0 1996-07-25          cs137   
 144257    4818  18.235001  58.583302      460.0 1996-07-25          pu238   
 144259    4818  18.235001  58.583302      460.0 1996-07-25  pu239_240_tot   
 144265    4818  18.235001  58.583302      460.0 1996-07-25           sr90   
 
            activity  uncertaint  unit_id  detecti

***

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, **kwargs):
    dfs = netcdf4_to_df(fname_in)
    tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                                ParseTimeCB(cfg()),
                                LookupSampleType(),
                                LookupNuclideIdCB(),
                                RenameColumnCB()
                                ])
    
    encoder = OpenRefineCsvEncoder(tfm(), 
                            dest_fname=fname_out, 
                            **kwargs)
    encoder.encode()
    return encoder

In [None]:
encode(fname_in, fname_out, verbose=False)

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


<marisco.serializers.OpenRefineCsvEncoder>