# NetCDF to Open Refine CSV (WIP)

***

## Packages import

mamba install dask --force-reinstall


In [11]:

from pathlib import Path # This module offers classes representing filesystem paths
import xarray as xr
from netCDF4 import Dataset
import pandas as pd
import xarray as xr
import numpy as np
from marisco.callbacks import (Callback, Transformer,
                               EncodeTimeCB, SanitizeLonLatCB)
import fastcore.all as fc # package that brings fastcore functionality, see https://fastcore.fast.ai/.
from cftime import num2pydate 
from marisco.configs import cfg, lut_path
from marisco.serializers import OpenRefineCsvEncoder

Get the current working directory (cwd).  

In [12]:
Path.cwd()

Path('/home/marisco/downloads/marisco/nbs/handlers')

In [13]:
fname_in = '../../_data/output/helcom.nc'
fname_out = '../../_data/output/helcom_test.csv'

***

### Load NetCDF 

load netcdf4 data

In [14]:
def netcdf4_to_df(fname_in):
    # read nc file
    netcdf4_data = Dataset(fname_in, "r")
    # Create dictionary of dataframes
    dfs={}
    for group in (netcdf4_data.groups.keys()):
        ds = xr.open_dataset(fname_in, group=group,  decode_times=False)
        dfs[group]=ds.to_dataframe()
    netcdf4_data.close()
    return(dfs)

***

In [15]:
dfs = netcdf4_to_df(fname_in)
dfs['seawater']

Unnamed: 0_level_0,lon,lat,depth,time,h3,h3_unc,h3_unit,k40,k40_unc,k40_unit,...,cm242_unit,cm244,cm244_unc,cm244_unit,pu239_240_tot,pu239_240_tot_unc,pu239_240_tot_unit,cm243_244_tot,cm243_244_tot_unc,cm243_244_tot_unit
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,11.078300,54.349998,0.0,515980800,,,0,,,0,...,0,,,0,,,0,,,0
1,10.316700,54.500000,0.0,516153600,,,0,,,0,...,0,,,0,,,0,,,0
2,21.026600,55.305000,0.0,1549929600,,,0,,,0,...,0,,,0,,,0,,,0
3,16.448299,55.348301,0.0,515980800,,,0,,,0,...,0,,,0,,,0,,,0
4,18.879999,55.500000,0.0,1280707200,,,0,,,0,...,0,,,0,,,0,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4814,18.233299,58.583302,459.0,613440000,6520.0,1791.099976,1,,,0,...,0,,,0,,,0,,,0
4815,18.233299,58.580002,460.0,769478400,,,0,,,0,...,0,,,0,0.00356,0.000494,1,,,0
4816,18.233299,58.583302,460.0,743472000,,,0,,,0,...,0,,,0,,,0,,,0
4817,18.231701,58.583302,460.0,965260800,2370.0,23.700001,1,,,0,...,0,,,0,,,0,,,0


In [16]:
dfs['seawater'].columns

Index(['lon', 'lat', 'depth', 'time', 'h3', 'h3_unc', 'h3_unit', 'k40',
       'k40_unc', 'k40_unit', 'mn54', 'mn54_unit', 'co60', 'co60_unc',
       'co60_unit', 'sr89', 'sr89_unit', 'sr90', 'sr90_unc', 'sr90_unit',
       'zr95', 'zr95_unc', 'zr95_unit', 'nb95', 'nb95_unc', 'nb95_unit',
       'tc99', 'tc99_unc', 'tc99_unit', 'ru103', 'ru103_unc', 'ru103_unit',
       'ru106', 'ru106_unc', 'ru106_unit', 'ag110m', 'ag110m_unc',
       'ag110m_unit', 'sb125', 'sb125_unc', 'sb125_unit', 'cs134', 'cs134_unc',
       'cs134_unit', 'cs137', 'cs137_unc', 'cs137_unit', 'ba140', 'ba140_unc',
       'ba140_unit', 'ce144', 'ce144_unc', 'ce144_unit', 'pb210', 'pb210_unc',
       'pb210_unit', 'po210', 'po210_unc', 'po210_unit', 'u234', 'u234_unit',
       'u238', 'u238_unit', 'np237', 'np237_unc', 'np237_unit', 'pu238',
       'pu238_unc', 'pu238_unit', 'pu239', 'pu239_unc', 'pu239_unit', 'pu240',
       'pu240_unc', 'pu240_unit', 'am241', 'am241_unc', 'am241_unit', 'cm242',
       'cm242_unc', 

### Transpose Nuclides 

In [17]:
#| export
# Define cols that are not nuclides
sample_cols_grp = {'seawater': ['sample','lon', 'lat', 'depth', 'time'],
           'sediment': ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type'],
           'biota': ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part']}

In [18]:
#| export
class TransposeNuclideColumns(Callback):
    
    " Transpose NetCDF nuclide data."
    def __init__(self, cols_grp=sample_cols_grp): fc.store_attr()
    
    def __call__(self, tfm):
        for grp in tfm.dfs.keys():
            tfm.dfs[grp]=self.transpose_nuclides(tfm.dfs[grp], grp)
            
    def transpose_nuclides(self, df, group):
        sample_cols=self.cols_grp[group]
        nuclide_unc_unit_cols=list(set(df.columns) - set(sample_cols))
        unc_cols=[x for x in nuclide_unc_unit_cols if '_unc' in x]
        unit_cols=[x for x in nuclide_unc_unit_cols if '_unit' in x]
        dl_cols=[x for x in nuclide_unc_unit_cols if '_dl' in x]
        nuclide_cols= list(set(nuclide_unc_unit_cols) - set(unit_cols+unc_cols+dl_cols))
              
        # Transpose nuclide_cols
        nuclide_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=nuclide_cols, var_name='nuclide', value_name='activity')
        
        # Transpose unc_cols
        unc_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=unc_cols, var_name='nuclide', value_name='uncertainty')
        unc_df['nuclide'] = unc_df['nuclide'].str.replace('_unc', '')    
            
        # Transpose unit_cols
        unit_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=unit_cols, var_name='nuclide', value_name='unit_id')       
        unit_df['nuclide'] = unit_df['nuclide'].str.replace('_unit', '')
        
        # Transpose dl_cols
        dl_df=pd.melt(frame=df, id_vars=sample_cols, value_vars=dl_cols, var_name='nuclide', value_name='dl')       
        dl_df['nuclide'] = dl_df['nuclide'].str.replace('_dl', '')        
        
        # Combine nuclide_df, unc_df, unit_df and dl_df
        combine_on=sample_cols+['nuclide']
        df = pd.merge(nuclide_df, unc_df,  how='outer', left_on= combine_on, right_on = combine_on)
        df = pd.merge(df, unit_df,  how='outer', left_on= combine_on, right_on = combine_on)
        df = pd.merge(df, dl_df,  how='outer', left_on= combine_on, right_on = combine_on)
        
        # Keep all rows where 'activity' is not 'nan' OR 'uncertainty' is not 'nan' OR 'dl' is not 'nan' OR'unit_id' not equal 0.
        df=df[(df['activity'].notna()) | (df['uncertainty'].notna()) | (df['dl'].notna()) | (df['unit_id'] != 0 )]
        return(df)            

In [19]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns()])
tfm()

KeyError: "The following id_vars or value_vars are not present in the DataFrame: ['sample']"

***

### Parse Time

In [None]:
#| export
class ParseTimeCB(Callback):
    def __init__(self, cfg): fc.store_attr()
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['time'] = tfm.dfs[k]['time'].apply(self.format_time)
    def format_time(self, x): 
        return num2pydate(x, units=self.cfg['units']['time'])

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg())])
tfm()

{'seawater':         sample        lon        lat  depth       time nuclide    activity  \
 9           10  14.257800  53.942200   10.0 2012-06-13   cs137   23.500000   
 25          10  14.257800  53.942200   10.0 2012-06-13    sr90    4.950000   
 39          11  14.263300  53.948299   10.0 1997-07-30   cs137   40.000000   
 55          11  14.263300  53.948299   10.0 1997-07-30    sr90    7.100000   
 69          12  14.263300  53.948299   10.0 2000-08-20   cs137   52.099998   
 ...        ...        ...        ...    ...        ...     ...         ...   
 144218    4817  24.334999  65.634697   17.0 1992-05-25   cs134    2.100000   
 144219    4817  24.334999  65.634697   17.0 1992-05-25   cs137   22.000000   
 144221    4817  24.334999  65.634697   17.0 1992-05-25     k40  220.000000   
 144249    4818  24.334999  65.634697   17.0 1996-08-28   cs137   38.500000   
 144251    4818  24.334999  65.634697   17.0 1996-08-28     k40  525.000000   
 
         uncertainty  unit_id  dl  
 9

***

### Sample Type 

In [None]:
#| export
# Define sample types groups
sample_type_lut = {'seawater': 1,
           'sediment': 2,
           'biota': 3,
           'suspended matter': 4}

In [None]:
#| export
class LookupSampleType(Callback):
    def __init__(self, lut=sample_type_lut): fc.store_attr()
    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            tfm.dfs[k]['samptype_id'] = self.lut[k]
            

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType()])
tfm()

{'seawater':         sample        lon        lat  depth       time nuclide    activity  \
 9           10  14.257800  53.942200   10.0 2012-06-13   cs137   23.500000   
 25          10  14.257800  53.942200   10.0 2012-06-13    sr90    4.950000   
 39          11  14.263300  53.948299   10.0 1997-07-30   cs137   40.000000   
 55          11  14.263300  53.948299   10.0 1997-07-30    sr90    7.100000   
 69          12  14.263300  53.948299   10.0 2000-08-20   cs137   52.099998   
 ...        ...        ...        ...    ...        ...     ...         ...   
 144218    4817  24.334999  65.634697   17.0 1992-05-25   cs134    2.100000   
 144219    4817  24.334999  65.634697   17.0 1992-05-25   cs137   22.000000   
 144221    4817  24.334999  65.634697   17.0 1992-05-25     k40  220.000000   
 144249    4818  24.334999  65.634697   17.0 1996-08-28   cs137   38.500000   
 144251    4818  24.334999  65.634697   17.0 1996-08-28     k40  525.000000   
 
         uncertainty  unit_id  dl  sam

***

### Nuclide lookup

In [None]:
#| export
def get_nucnames_lut():
    fname = lut_path() / 'dbo_nuclide.xlsx'
    df_nuclide = pd.read_excel(fname, usecols=['nuclide_id', 'nc_name'])
    return df_nuclide.set_index('nc_name').to_dict()['nuclide_id']

In [None]:
# | export
class LookupNuclideIdCB(Callback):
    "Lookup MARIS nuclide_id."
    def __init__(self,
                 fn_lut=get_nucnames_lut):
        fc.store_attr()

    def __call__(self, tfm):
        lut = self.fn_lut()
        for k in tfm.dfs.keys():
            tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
            tfm.dfs[k]['nuclide_id']=tfm.dfs[k]['nuclide_id'].astype('int64')

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':         sample        lon        lat  depth       time nuclide    activity  \
 9           10  14.257800  53.942200   10.0 2012-06-13   cs137   23.500000   
 25          10  14.257800  53.942200   10.0 2012-06-13    sr90    4.950000   
 39          11  14.263300  53.948299   10.0 1997-07-30   cs137   40.000000   
 55          11  14.263300  53.948299   10.0 1997-07-30    sr90    7.100000   
 69          12  14.263300  53.948299   10.0 2000-08-20   cs137   52.099998   
 ...        ...        ...        ...    ...        ...     ...         ...   
 144218    4817  24.334999  65.634697   17.0 1992-05-25   cs134    2.100000   
 144219    4817  24.334999  65.634697   17.0 1992-05-25   cs137   22.000000   
 144221    4817  24.334999  65.634697   17.0 1992-05-25     k40  220.000000   
 144249    4818  24.334999  65.634697   17.0 1996-08-28   cs137   38.500000   
 144251    4818  24.334999  65.634697   17.0 1996-08-28     k40  525.000000   
 
         uncertainty  unit_id  dl  sam

In [None]:
tfm.dfs['biota']

Unnamed: 0,sample,lon,lat,depth,time,species_id,body_part,nuclide,activity,uncertainty,unit_id,dl,samptype_id,nuclide_id
14,0,14.241,53.458000,0.000000,1998-09-04,280,1,cs137,15.300000,0.1836,3,,3,33
65,1,14.150,54.040001,14.000000,1987-06-27,59,1,cs134_137_tot,23.900000,1.1950,3,,3,76
116,2,14.150,54.040001,13.000000,1990-07-07,59,1,cs134,3.600000,1.6920,3,,3,31
117,2,14.150,54.040001,13.000000,1990-07-07,59,1,cs134_137_tot,37.200001,1.4880,3,,3,76
118,2,14.150,54.040001,13.000000,1990-07-07,59,1,cs137,32.799999,2.6240,3,,3,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
204299,3953,13.450,55.020000,48.799999,1989-12-12,122,59,sr90,7.600000,0.2888,3,,3,12
204302,3953,13.450,55.020000,48.799999,1989-12-12,122,59,th228,4.200000,0.2058,3,,3,57
204328,3954,11.330,54.188999,22.000000,1984-11-18,273,59,k40,50.599998,8.6020,3,,3,4
204374,3955,12.470,54.433998,22.000000,1989-06-03,120,59,cs137,3.070000,0.5526,3,,3,33


***

### Rename columns

Rename 

TODO: What is ref_id? 
Should/is 'sample' used to create a ref_id? 

TODO: List COI in order

TODO 'nuclide' will be dropped in the actual OR

In [None]:
#| export
# Define columns of interest by sample type
coi_grp = {'seawater': ['sample', 'lon', 'lat', 'depth', 'time', 'nuclide', 'activity',
                     'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
       'sediment' : ['sample', 'lon', 'lat', 'depth', 'time', 'sed_type', 'nuclide',
                     'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id'],
       'biota' : ['sample', 'lon', 'lat', 'depth', 'time', 'species_id', 'body_part',
                     'nuclide', 'activity', 'uncertainty', 'unit_id', 'dl', 'samptype_id', 'nuclide_id']}

In [None]:
#| export
# Define column names renaming rules
renaming_rules = {
    'lat': 'latitude',
    'lon': 'longitude',
    'time': 'begperiod',
    'depth': 'sampdepth',
    'nuclide': 'nuclide_id',
    'uncertainty': 'uncertaint',
    'dl': 'detection',
    'sed_type': 'sedtype_id (0)',
    'species_id': 'species_id (0)',
    'body_part': 'bodypar_id',
}

In [None]:
#| export
class RenameColumnCB(Callback):
    def __init__(self,
                 coi=coi_grp,
                 renaming_rules=renaming_rules):
        fc.store_attr()

    def __call__(self, tfm):
        for k in tfm.dfs.keys():
            # Select cols of interest
            tfm.dfs[k] = tfm.dfs[k].loc[:, self.coi[k]]

            # Rename cols
            tfm.dfs[k].rename(columns=self.renaming_rules, inplace=True)

In [None]:
dfs = netcdf4_to_df(fname_in)
tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                            ParseTimeCB(cfg()),
                            LookupSampleType(),
                            LookupNuclideIdCB(),
                            RenameColumnCB()
                            ])
tfm()

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


{'seawater':         sample  longitude   latitude  sampdepth  begperiod  nuclide_id  \
 9           10  14.257800  53.942200       10.0 2012-06-13       cs137   
 25          10  14.257800  53.942200       10.0 2012-06-13        sr90   
 39          11  14.263300  53.948299       10.0 1997-07-30       cs137   
 55          11  14.263300  53.948299       10.0 1997-07-30        sr90   
 69          12  14.263300  53.948299       10.0 2000-08-20       cs137   
 ...        ...        ...        ...        ...        ...         ...   
 144218    4817  24.334999  65.634697       17.0 1992-05-25       cs134   
 144219    4817  24.334999  65.634697       17.0 1992-05-25       cs137   
 144221    4817  24.334999  65.634697       17.0 1992-05-25         k40   
 144249    4818  24.334999  65.634697       17.0 1996-08-28       cs137   
 144251    4818  24.334999  65.634697       17.0 1996-08-28         k40   
 
           activity  uncertaint  unit_id  detection  samptype_id  nuclide_id  
 9     

***

### Encoding

In [None]:
#| export
def encode(fname_in, fname_out, **kwargs):
    dfs = netcdf4_to_df(fname_in)
    tfm = Transformer(dfs, cbs=[TransposeNuclideColumns(),
                                ParseTimeCB(cfg()),
                                LookupSampleType(),
                                LookupNuclideIdCB(),
                                RenameColumnCB()
                                ])
    
    encoder = OpenRefineCsvEncoder(tfm(), 
                            dest_fname=fname_out, 
                            **kwargs)
    encoder.encode()
    return encoder

In [None]:
encode(fname_in, fname_out, verbose=False)

  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)
  tfm.dfs[k]['nuclide_id'] = tfm.dfs[k]['nuclide'].replace(lut)


<marisco.serializers.OpenRefineCsvEncoder at 0x7f68d95137f0>