In [None]:
#| default_exp callbacks

# Callbacks
> Callbacks used in handlers

In [None]:
#| export
import copy
import fastcore.all as fc
from operator import attrgetter
from cftime import date2num
import numpy as np

from marisco.configs import cfg

In [None]:
import pandas as pd

In [None]:
#| export
# Base class for callbacks. `order` is the sort key used by `run_cbs`
# (lower values run first). Documented with a comment rather than a docstring
# because `run_cbs` logs `cb.__doc__`.
class Callback:
    order = 0

In [None]:
#| export
def run_cbs(cbs, obj=None):
    "Call each callback of `cbs` on `obj` in ascending `order`, recording callback docstrings in `obj.logs` when available."
    # `sorted` is stable: callbacks sharing the same `order` keep their list position.
    # `cbs or ()` makes a missing callback list a no-op instead of a TypeError.
    for cb in sorted(cbs or (), key=attrgetter('order')):
        # `obj` defaults to None (and may not expose `logs`): only log when there is somewhere to log to.
        logs = getattr(obj, 'logs', None)
        if cb.__doc__ and logs is not None: logs.append(cb.__doc__)
        cb(obj)

In [None]:
#| export
class Transformer():
    "Run callbacks over a private copy of the data: a dict of DataFrames (one per group) or a single DataFrame."
    def __init__(self, dfs, cbs=None):
        self.cbs = cbs
        # Work on copies so that the caller's data is never modified by the callbacks
        self.dfs = self._copy(dfs)
        # Filled by `run_cbs` with the docstring of each callback it runs
        self.logs = []

    def _copy(self, dfs):
        "Copy `dfs`, whether it is a dict of copyable objects or a single copyable object."
        if isinstance(dfs, dict):
            return {k: v.copy() for k, v in dfs.items()}
        return dfs.copy()

    def _frames(self):
        "List the data held in `self.dfs`, whether it is a dict of groups or a single object."
        return list(self.dfs.values()) if isinstance(self.dfs, dict) else [self.dfs]

    def callback(self):
        "Run all callbacks on this transformer."
        run_cbs(self.cbs, self)

    def unique(self, col_name):
        "Distinct values of a specific column present in all groups"
        # `get` returns None when the column is absent; look it up once per frame
        found = (df.get(col_name) for df in self._frames())
        columns = [col for col in found if col is not None]
        # With no matching column, `np.unique([])` gives an empty float64 array
        values = np.concatenate(columns) if columns else []
        return np.unique(values)

    def __call__(self):
        "Run the callbacks (if any) and return the transformed data."
        if self.cbs: self.callback()
        return self.dfs

Example:

In [None]:
# Two groups; only `biota` carries a `species` column
dfs = {
    'biota': pd.DataFrame({'id': [0, 1, 2], 'species': [0, 2, 0], 'depth': [2, 3, 4]}),
    'seawater': pd.DataFrame({'id': [0, 1, 2], 'depth': [3, 4, 5]}),
}
tfm = Transformer(dfs)
tfm()
tfm.unique('species')

array([0, 2])

In [None]:
# A column missing from every group yields an empty array instead of raising
tfm.unique('non_existing_var')

array([], dtype=float64)

## Generic

In [None]:
#| export
class EncodeTimeCB(Callback):
    "Encode time as `int` representing seconds since xxx"
    # NOTE: the docstring above is what `run_cbs` appends to the transformer's logs.
    def __init__(self, cfg):
        # Keeps `cfg` on the instance; it is read back as `self.cfg` in `__call__`
        fc.store_attr()

    def __call__(self, tfm):
        # Replace each group's `time` column by its `date2num` encoding,
        # using the units found under cfg['units']['time'].
        for grp in tfm.dfs:
            frame = tfm.dfs[grp]
            frame['time'] = frame['time'].apply(
                lambda x: date2num(x, units=self.cfg['units']['time']))

In [None]:
#| export
class SanitizeLonLatCB(Callback):
    "Drop row when both longitude & latitude equal 0. Drop unrealistic longitude & latitude values. Convert longitude & latitude `,` separator to `.` separator."
    # NOTE: the docstring above is what `run_cbs` appends to the transformer's logs.
    def __init__(self, verbose=False): fc.store_attr()

    @staticmethod
    def _to_float(x):
        "Parse a coordinate that may use `,` as decimal separator, e.g. '45,2' -> 45.2."
        return float(str(x).replace(',', '.'))

    def __call__(self, tfm):
        # Expects every group to have `lon` and `lat` columns.
        for grp, df in tfm.dfs.items():
            # Convert `,` separator to `.` separator and cast to float
            df['lon'] = [self._to_float(x) for x in df['lon']]
            df['lat'] = [self._to_float(x) for x in df['lat']]

            # Mask rows located exactly at (0, 0)
            mask_zeroes = (df.lon == 0) & (df.lat == 0)
            nZeroes = mask_zeroes.sum()
            if nZeroes and self.verbose:
                print(f'The "{grp}" group contains {nZeroes} data points whose (lon, lat) = (0, 0)')

            # Mask GPS coordinates out of bounds ("goob")
            mask_goob = (df.lon < -180) | (df.lon > 180) | (df.lat < -90) | (df.lat > 90)
            nGoob = mask_goob.sum()
            if nGoob and self.verbose:
                print(f'The "{grp}" group contains {nGoob} data points whose lon or lat are unrealistic. Outside -90 to 90 for latitude and -180 to 180 for longitude.')

            # Store an independent copy of the kept rows: assigning a column to a
            # filtered frame later (another callback, or a second run of this one)
            # would otherwise raise SettingWithCopyWarning on pandas < 3.
            tfm.dfs[grp] = df.loc[~(mask_zeroes | mask_goob)].copy()

In [None]:
# Check that measurements located at (0, 0) get removed
dfs = {'biota': pd.DataFrame({'lon': [0, 1, 0], 'lat': [0, 2, 0]})}
tfm = Transformer(dfs, cbs=[SanitizeLonLatCB()])
df_biota = tfm()['biota']  # run the callbacks once and reuse the result

# Only the (1, 2) row must survive
fc.test_eq(len(df_biota), 1)
fc.test_eq(df_biota.iloc[0].to_list(), [1., 2.])

In [None]:
# Check that a comma decimal separator gets replaced by a point
dfs = {'biota': pd.DataFrame({'lon': ['45,2'], 'lat': ['43,1']})}
tfm = Transformer(dfs, cbs=[SanitizeLonLatCB()])
df_biota = tfm()['biota']  # run the callbacks once and reuse the result

# The single row is kept, with both coordinates parsed as floats
fc.test_eq(len(df_biota), 1)
fc.test_eq(df_biota.iloc[0].to_list(), [45.2, 43.1])

In [None]:
# Check that out of bounds lon or lat get removed
dfs = {'biota': pd.DataFrame({'lon': [-190, 190, 1, 2, 1.1], 'lat': [1, 2, 91, -91, 2.2]})}
tfm = Transformer(dfs, cbs=[SanitizeLonLatCB()])
df_biota = tfm()['biota']  # run the callbacks once and reuse the result

# Only the last row has both lon and lat within bounds
fc.test_eq(len(df_biota), 1)
fc.test_eq(df_biota.iloc[0].to_list(), [1.1, 2.2])