In [None]:
#|default_exp data.serializer

# Serializer

> MARIS `.csv` data Serializer classes

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#|export
from __future__ import annotations
from fastcore.basics import store_attr
import pandas as pd
import re
import glob
from pathlib import Path
import xarray as xr
import numpy as np

pd.set_option('display.max_rows', 500)

## Serializer -

In [None]:
#|export
class Serializer():
    """Serializer abstract class.

    Placeholder base: `__init__` only declares the expected constructor
    arguments; it does not store or use any of them.
    """
    def __init__(self, 
                 fname:str, # File name path and name
                 dst:str, # Path of folder that will receive created .csv
                 col_id:str='ref_id', # Name of the unique id column in loaded .csv
                 #col_names:List[str]=['displaytext'] # Columns name as part of file name generated
                 col_names=['displaytext'] # Columns name as part of file name generated
                ):
        # No-op: none of the arguments is kept on the instance.
        pass

## NetCDFSerializer -

In [None]:
#|export
class NetCDFSerializer():
    """Serialize MARIS .csv to NetCDF4.

    Loads a MARIS `.csv` file into `self.df` and derives from it the sizes of
    the two NetCDF dimensions: the number of distinct stations and the largest
    number of distinct depths found at any single station.
    """
    def __init__(self, 
                 fname:str, # File name path and name
                 cols_station:list[str]=['latitude', 'longitude'], # Names of the lat. and lon. columns
                 col_depth:str='sampdepth' # Name of the depth column
                ):
        store_attr()
        # Keep a private copy so instances never share (or mutate) the default list.
        # A plain string (single station column) is left untouched.
        if not isinstance(cols_station, str):
            self.cols_station = list(cols_station)
        self.df = self.load_data()

    def load_data(self):
        """Read the `.csv` at `self.fname` into `self.df` and return it."""
        self.df = pd.read_csv(Path(self.fname))
        return self.df

    def preview_dims_len(self, verbose=False):
        """Print the sizes of the `N_STATIONS` and `N_SAMPLES` dimensions.

        `verbose` is accepted for interface compatibility but is currently unused.
        """
        # TODO: report sparsity as well, e.g.
        # 1 - np.sum(self._n_depth_by_stations().values) / (self.n_stations()*self.n_samples())
        print(f'NetCDF dimensions size: N_STATIONS: {self.n_stations()} | N_SAMPLES: {self.n_samples()}')

    def n_stations(self):
        """Return the number of distinct `cols_station` value combinations in `self.df`."""
        return len(self.df[self.cols_station].drop_duplicates())

    def _n_depth_by_stations(self):
        """Return, for each station, the number of distinct `col_depth` values."""
        return self.df.groupby(self.cols_station)[self.col_depth].nunique()

    def n_samples(self):
        """Return the largest number of distinct depths found at a single station.

        Returns 0 when there is no data (or no station group) instead of NaN.
        """
        if self.df.empty:
            return 0
        depth_counts = self._n_depth_by_stations()
        # The maximum does not depend on ordering, so no sort is needed.
        return depth_counts.max() if len(depth_counts) else 0

In [None]:
#|export
def flatten_cols(cols):
    """Flatten two-level column labels into single strings.

    Each `(first, second)` pair becomes `second_first`; when `second` is
    empty, the label is just `first`.
    """
    return [second + ('_' if second else '') + first for first, second in cols]

### Overview of MARIS datasets NetCDF dimensions

In [None]:
# Glob pattern selecting the MARIS .csv files to summarize.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
dst = '/Users/franckalbinet/pro/IAEA/MARIS/data/csv/*'
# Column-oriented accumulator: one list per output column, appended to in lockstep.
mrs_dims = {'name': [], 'n_stations': [], 'n_samples': []}
for f in glob.glob(dst):
    # Instantiating the serializer reads the whole file into memory.
    serializer = NetCDFSerializer(f)
    mrs_dims['name'].append(Path(f).name)
    mrs_dims['n_stations'].append(serializer.n_stations())
    mrs_dims['n_samples'].append(serializer.n_samples())
# One row per file: file name, number of stations, max number of depths per station.
df_mrs_dims = pd.DataFrame(mrs_dims)

In [None]:
df_mrs_dims.sort_values(by=['n_stations', 'n_samples'], ascending=False).reset_index(drop=True)

Unnamed: 0,name,n_stations,n_samples
0,395-bailly-du-bois-et-al-2020.csv,23262,82
1,99-aoyama-and-hirose-2004.csv,5772,63
2,191-ospar-comission-s-radioactive-substances-c...,3151,8
3,84-maff-now-cefas-2004.csv,2679,24
4,402-cchdo-2018.csv,2353,33
5,95-ipsn-cea-now-irsn-2004.csv,1856,1
6,100-helcom-mors-2018.csv,1648,22
7,103-radnor-2010.csv,1621,13
8,237-takata-et-al-2018.csv,1261,17
9,120-nra-nuclear-regulation-authority-2013.csv,934,10


### Converting one dataset to NetCDF

In [None]:
#fname = 'files/41-aarkrog-et-al-1994-test.csv'
fname = '/Users/franckalbinet/pro/IAEA/MARIS/data/csv/84-maff-now-cefas-2004.csv'

In [None]:
serializer = NetCDFSerializer(fname)

In [None]:
serializer.preview_dims_len()

NetCDF dimensions size: N_STATIONS: 2679 | N_SAMPLES: 24


In [None]:
serializer.df.shape

(33306, 79)

In [None]:
# Should I sort it by date/time (begperiod)?
#serializer.df['N_STATIONS'] = list(zip(serializer.df.latitude, serializer.df.longitude))

In [None]:
df = serializer.df.copy()

In [None]:
df.columns

Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'taxondb', 'taxondbid',
       'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar', 'sliceup',
       'slicedown', 'sedtype_id', 'sedtype', 'sedrepname', 'nuclide_id',
       'nusymbol', 'volume', 'salinity', 'temperatur', 'filtered', 'filtpore',
       'samparea', 'drywt', 'wetwt', 'percentwt', 'sampmet_id', 'sampmet',
       'prepmet_id', 'prepmet', 'drymet_id', 'drymet', 'counmet_id', 'counmet',
       'decayedto', 'detection', 'activity', 'uncertaint', 'unit_id', 'unit',
       'vartype', 'freq', 'rangelow', 'rangeupp', 'profile', 'transect_id',
       'measure_note', 'shapetype_id', 'profile_id', 'sampnote',
       'ref_fulltext', 'ref_yea

In [None]:
df.head()

Unnamed: 0,latitude,longitude,sampdepth,nusymbol,activity,uncertaint,begperiod
24290,54.393333,-3.485,0.0,137Cs,166.87,,1962-01-08
12856,54.433333,-3.535556,0.0,134Cs,,,1962-01-08
24291,54.393333,-3.485,0.0,134Cs,,,1962-01-08
12855,54.433333,-3.535556,0.0,137Cs,179.82,,1962-01-08
11773,54.433333,-3.535556,0.0,134Cs,,,1962-01-22


In [None]:
# Convert `begperiod` to datetime
df['begperiod'] = pd.to_datetime(df['begperiod'])

In [None]:
# Keep only position, depth, nuclide, measurement and date columns, ordered by sampling start date
df = df.loc[:, ['latitude', 'longitude', 'sampdepth', 'nusymbol', 'activity', 'uncertaint', 'begperiod']].sort_values("begperiod")

In [None]:
df.head()

Unnamed: 0,latitude,longitude,sampdepth,nusymbol,activity,uncertaint,begperiod
24290,54.393333,-3.485,0.0,137Cs,166.87,,1962-01-08
12856,54.433333,-3.535556,0.0,134Cs,,,1962-01-08
24291,54.393333,-3.485,0.0,134Cs,,,1962-01-08
12855,54.433333,-3.535556,0.0,137Cs,179.82,,1962-01-08
11773,54.433333,-3.535556,0.0,134Cs,,,1962-01-22


### Pivoting it to get `nusymbol` as column names

In [None]:
df['nusymbol'].unique()

array(['137Cs', '134Cs', '137Cs/134Cs'], dtype=object)

In [None]:
df.head()

Unnamed: 0,latitude,longitude,sampdepth,nusymbol,activity,uncertaint,begperiod
24290,54.393333,-3.485,0.0,137Cs,166.87,,1962-01-08
12856,54.433333,-3.535556,0.0,134Cs,,,1962-01-08
24291,54.393333,-3.485,0.0,134Cs,,,1962-01-08
12855,54.433333,-3.535556,0.0,137Cs,179.82,,1962-01-08
11773,54.433333,-3.535556,0.0,134Cs,,,1962-01-22


In [None]:
# Wide layout: one row per (latitude, longitude, sampdepth, begperiod) and one
# `activity` / `uncertaint` column per nuclide symbol.
df_pivoted = (
    pd.pivot_table(df,
                   values=['activity', 'uncertaint'],
                   index=['latitude', 'longitude', 'sampdepth', 'begperiod'],
                   columns=['nusymbol'])
    .sort_values(by=['begperiod', 'sampdepth'])
    .reset_index()
)
# Collapse the two-level (value, nuclide) header into flat "<nuclide>_<value>" names
df_pivoted.columns = flatten_cols(df_pivoted.columns)
df_pivoted

Unnamed: 0,latitude,longitude,sampdepth,begperiod,134Cs_activity,137Cs_activity,137Cs/134Cs_activity,134Cs_uncertaint,137Cs_uncertaint,137Cs/134Cs_uncertaint
0,54.393333,-3.485000,0.0,1962-01-08,,166.87,,,,
1,54.433333,-3.535556,0.0,1962-01-08,,179.82,,,,
2,54.393333,-3.485000,0.0,1962-01-22,,186.48,,,,
3,54.433333,-3.535556,0.0,1962-01-22,,293.41,,,,
4,54.393333,-3.485000,0.0,1962-02-07,,335.96,,,,
...,...,...,...,...,...,...,...,...,...,...
12747,51.791389,1.167222,0.0,1989-12-19,26.76,221.16,8.3,1.5,1.55,0.47
12748,51.973333,1.381111,0.0,1989-12-19,,23.53,,,0.75,
12749,52.154722,1.606944,0.0,1989-12-19,,28.47,,,0.80,
12750,52.211667,1.626111,0.0,1989-12-19,,52.64,,,0.84,


### Creating `N_STATIONS` and `N_SAMPLES` indexes (NetCDF dimensions)

In [None]:
# Station key: one (latitude, longitude) tuple per row
df_pivoted['N_STATIONS'] = list(zip(df_pivoted.latitude, df_pivoted.longitude))

In [None]:
df_pivoted.head()

Unnamed: 0,latitude,longitude,sampdepth,begperiod,134Cs_activity,137Cs_activity,137Cs/134Cs_activity,134Cs_uncertaint,137Cs_uncertaint,137Cs/134Cs_uncertaint,N_STATIONS
0,54.393333,-3.485,0.0,1962-01-08,,166.87,,,,,"(54.3933333333333, -3.485)"
1,54.433333,-3.535556,0.0,1962-01-08,,179.82,,,,,"(54.4333333333333, -3.53555555555556)"
2,54.393333,-3.485,0.0,1962-01-22,,186.48,,,,,"(54.3933333333333, -3.485)"
3,54.433333,-3.535556,0.0,1962-01-22,,293.41,,,,,"(54.4333333333333, -3.53555555555556)"
4,54.393333,-3.485,0.0,1962-02-07,,335.96,,,,,"(54.3933333333333, -3.485)"


In [None]:
# Encode each distinct station tuple as an integer code (`idx`, one per row);
# `cols` receives the distinct tuples and is not used in the cells shown here.
idx, cols = pd.factorize(df_pivoted['N_STATIONS'])

In [None]:
# Replace the tuple key with its integer station code
df_pivoted['N_STATIONS'] = idx
# Displayed only: the sorted frame is not assigned back to `df_pivoted`
df_pivoted.sort_values(by=['N_STATIONS', 'sampdepth'])

Unnamed: 0,latitude,longitude,sampdepth,begperiod,134Cs_activity,137Cs_activity,137Cs/134Cs_activity,134Cs_uncertaint,137Cs_uncertaint,137Cs/134Cs_uncertaint,N_STATIONS
0,54.393333,-3.485000,0.0,1962-01-08,,166.87,,,,,0
2,54.393333,-3.485000,0.0,1962-01-22,,186.48,,,,,0
4,54.393333,-3.485000,0.0,1962-02-07,,335.96,,,,,0
6,54.393333,-3.485000,0.0,1962-02-20,,426.98,,,,,0
8,54.393333,-3.485000,0.0,1962-03-08,,805.86,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...
12639,55.970833,-2.408611,0.0,1989-08-11,,24.94,,,1.50,,2645
12655,55.970833,-2.408611,0.0,1989-09-08,,22.31,,,1.49,,2645
12672,55.970833,-2.408611,0.0,1989-10-06,,24.18,,,1.50,,2645
12690,55.970833,-2.408611,0.0,1989-11-04,,26.61,,,1.57,,2645


In [None]:
# Inspect the activity columns of a single station, ordered by sampling depth
station = 74
df_pivoted.loc[
    df_pivoted['N_STATIONS'] == station,
    ['N_STATIONS', 'latitude', 'longitude', 'sampdepth', 'begperiod',
     '134Cs_activity', '137Cs_activity', '137Cs/134Cs_activity'],
].sort_values(by='sampdepth')


Unnamed: 0,N_STATIONS,latitude,longitude,sampdepth,begperiod,134Cs_activity,137Cs_activity,137Cs/134Cs_activity
2401,74,56.666667,-6.133333,0.0,1972-10-15,22.94,182.04,7.9
1938,74,56.666667,-6.133333,0.0,1972-01-15,27.75,132.09,4.8
1975,74,56.666667,-6.133333,0.0,1972-03-15,49.58,284.16,5.7
2009,74,56.666667,-6.133333,0.0,1972-04-15,43.66,299.7,6.9
2030,74,56.666667,-6.133333,0.0,1972-05-15,37.74,281.57,7.5
2307,74,56.666667,-6.133333,0.0,1972-06-15,44.03,286.38,6.5
2329,74,56.666667,-6.133333,0.0,1972-07-15,22.94,138.75,6.0
2382,74,56.666667,-6.133333,0.0,1972-09-15,27.01,196.84,7.3
10698,74,56.666667,-6.133333,0.0,1984-11-25,16.19,212.06,13.1
2438,74,56.666667,-6.133333,0.0,1972-11-15,42.18,317.09,7.5


In [None]:
# Number of different depth measurements by station
df_pivoted.groupby('N_STATIONS')['sampdepth'].nunique().sort_values(ascending=False)

N_STATIONS
74      24
77      19
1006    14
1003    14
1005    13
        ..
1132     1
1131     1
1130     1
1129     1
2645     1
Name: sampdepth, Length: 2646, dtype: int64

In [None]:
#def get_rank(grp):
#    grp['N_SAMPLES'] = (grp['sampdepth'].rank() - 1).astype(int)
#    return grp

In [None]:
# df = df.groupby(['N_STATIONS','sampdepth']).apply(get_rank)