In [None]:
#| default_exp data.core

# Data core

> Core functionality to transform and gather data

In [None]:
#|export
from __future__ import annotations
from fastcore.basics import store_attr
import pandas as pd
import re
from pathlib import Path
from tqdm import tqdm

## DumpExploder

In [None]:
#|export
class DumpExploder():
    """Explode the MARIS global .csv dump into distinct dataset-specific .csv files.

    The dump is read into `self.df` on construction. Rows are grouped on `col_id`
    and each group is written to `dst` under a file name derived from the values
    of `col_id` and `cols_name` for that group.
    """
    def __init__(self, 
                 fname:str, # Path and name of the global .csv dump
                 dst:str, # Path of folder that will receive created .csv
                 col_id:str='ref_id', # Name of the unique id column in loaded .csv
                 cols_name:list=['displaytext'] # Column names used as part of generated file names
                ):
        store_attr()
        self.dst = Path(dst)
        # The default list is only read, never mutated; unpacking also accepts tuples.
        self.cols = [col_id, *cols_name]
        self.df = None
        self.load_data()
        
    def load_data(self):
        """Read the .csv at `self.fname` into `self.df` and return the dataframe."""
        self.df = pd.read_csv(Path(self.fname))
        return self.df
    
    def num_ds(self, verbose:bool=False):
        """Print the number of distinct `col_id` values; if `verbose`, also print
        one row per dataset restricted to the naming columns.

        Raises `Exception` when no data has been loaded.
        """
        if getattr(self, 'df', None) is None:
            raise Exception('Run `.load_data()` first: no data loaded yet')
        print(f'Number of distinct datasets: {len(self.df[self.col_id].unique())}')
        if verbose:
            print(self.df.drop_duplicates(subset=[self.col_id])[self.cols])
    
    def explode(self):
        """Write one .csv per distinct `col_id` value into `self.dst`.

        Loads the dump first if it is not in memory and creates the destination
        folder when it does not exist yet.
        """
        if getattr(self, 'df', None) is None:
            self.load_data()
        # `to_csv` does not create missing parent folders.
        self.dst.mkdir(parents=True, exist_ok=True)
        grouped = self.df.groupby(self.col_id)
        print('Exploding MARIS global csv dump into distinct dataset-specific ones...')
        for _, group in tqdm(grouped):
            name = self._namer(group)
            group.to_csv(self.dst/name, index=False)

    def _namer(self, group):
        """Build the output file name for `group` from its first distinct
        combination of naming-column values: values are joined with '-', every
        run of non-word characters becomes '-', and the result is lower-cased.
        """
        first_values = group[self.cols].drop_duplicates().values[0]
        name = '-'.join(str(value) for value in first_values)
        return re.sub(r'\W+', '-', name).lower() + '.csv'

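Usage example: instantiate `DumpExploder` with the path of the MARIS global `.csv` dump and the destination folder, inspect the loaded data, then explode it into one `.csv` file per dataset.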

In [None]:
# Source dump and folder receiving the dataset-specific files
fname, dst = '../files/csv/maris-dump-test.csv', 'files/exploded'

exploder = DumpExploder(fname=fname, dst=dst)

In [None]:
# Read the dump from disk and report its dimensions and column labels
df = exploder.load_data()
print('Dataframe shape: ', df.shape)
print('Columns list', df.columns)

Dataframe shape:  (100, 79)
Columns list Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'taxondb', 'taxondbid',
       'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar', 'sliceup',
       'slicedown', 'sedtype_id', 'sedtype', 'sedrepname', 'nuclide_id',
       'nusymbol', 'volume', 'salinity', 'temperatur', 'filtered', 'filtpore',
       'samparea', 'drywt', 'wetwt', 'percentwt', 'sampmet_id', 'sampmet',
       'prepmet_id', 'prepmet', 'drymet_id', 'drymet', 'counmet_id', 'counmet',
       'decayedto', 'detection', 'activity', 'uncertaint', 'unit_id', 'unit',
       'vartype', 'freq', 'rangelow', 'rangeupp', 'profile', 'transect_id',
       'measure_note', 'shapetype_id', 'profile_id', 's

In [None]:
# Print the number of distinct datasets and, being verbose, one row per dataset
exploder.num_ds(verbose=True)

Number of distinct datasets: 28
    ref_id                                        displaytext
0      237                                Takata et al., 2018
1      682           NRA - Nuclear Regulation Authority, 2021
2      103                                       RADNOR, 2010
3      126                         Fukushima Prefecture, 2011
4      395                        Bailly du Bois et al., 2020
5      402                                        CCHDO, 2018
6      100                                  HELCOM MORS, 2018
9      681           NRA - Nuclear Regulation Authority, 2021
11      84                             MAFF (now Cefas), 2004
12     190                             Schlitzer et al., 2018
13     679         TEPCO - Tokyo Electric Power Company, 2021
15     400                                 Boyer et al., n.d.
16     121         TEPCO - Tokyo Electric Power Company, 2011
18     234                                Aoyama et al., 2013
28     680         TEPCO - Tokyo Elect

In [None]:
# Write one .csv file per dataset into the destination folder
exploder.explode()

Exploding MARIS global csv dump into distinct dataset-specific ones...


100%|██████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 598.15it/s]


## Rules