In [None]:
#| default_exp handlers.generic

# Generic
> Generic data pipeline (handler) to convert datasets to `NetCDF` format

The input data is a dump from already imported MARIS datasets.


**Questions**:
1. do we put areaname?
2. when converting from tall (long) to wide format, a sample might have several nuclide types whose measurement methods differ. Maybe put them in metadata or as variable attributes?
3. is `decayedto` used and in what context? (question to be answered for all columns actually)
4. what are units of uncertainty?
5. do we keep detection?
6. which columns for which sample type?
7. f(nuclide) or f(nuclide, measurement) ? E.g volume, counmet, unit?
8. need an area_id -> area_name lut to be used as enumtype
9. ref_id vs. zoterourl vs. displaytext
   1.  e.g. do ref_id=129 and 130 point to the same dataset?

**Dev. board**: https://trello.com/b/IszgV1bj/marisco

## Packages import

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import pandas as pd
from tqdm import tqdm
from functools import partial
import fastcore.all as fc

from pathlib import Path

from marisco.callbacks import (Callback, Transformer)
# from marisco.metadata import (GlobAttrsFeeder, BboxCB,
#                               DepthRangeCB, TimeRangeCB,
#                               ZoteroCB, KeyValuePairCB)
from marisco.configs import lut_path, cdl_cfg


In [None]:
pd.set_option('display.max_rows', 100)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
fname_in = Path().home() / 'pro/data/maris/all-maris.txt'
dir_dest = '../../_data/output/dump'
# fname_out = '../../_data/output/helcom.nc'

## Utils

In [None]:
# | export
def load_dump(fname):
    "Read the tab-separated, ISO-8859-1 encoded dump file `fname` into a DataFrame."
    read_opts = dict(sep='\t', encoding='ISO-8859-1')
    return pd.read_csv(fname, **read_opts)

def load_data(df:pd.DataFrame, # MARIS global dump
              ref_id:int, # Reference id of interest
              ):
    "Load specific MARIS dataset through its ref_id, as a dict of DataFrames keyed by sample type."
    # Maps the `samptype` labels found in the dump to the keys of the returned dict.
    # A label missing from this table raises a KeyError.
    lut = {
        'Sediment': 'sediment',
        'Seawater': 'seawater',
        'Suspended matter': 'suspended-matter',
        'Biota': 'biota'}
    # Keep only the rows of the requested reference (the filter used to be
    # hard-coded to 695, so the `ref_id` argument had no effect).
    df_ref = df[df.ref_id == ref_id]
    # One entry per sample type present for this reference.
    return {lut[name]: grp for name, grp in df_ref.groupby('samptype')}

def get_zotero_key(df, ref_id):
    "Return the Zotero item key (last `/`-separated segment of `zoterourl`) for `ref_id`."
    # Filter on the requested reference (the filter used to be hard-coded to 695,
    # so the `ref_id` argument had no effect).
    result = df[df.ref_id == ref_id].zoterourl.unique()
    # Only the first URL is used; warn when the reference maps to several.
    if len(result) > 1: print('Several Zotero records have been found, please check!')
    return result[0].split('/')[-1]

## Load data

In [None]:
df = load_dump(fname_in)

## Data transformation pipeline

### Normalize nuclide names

In [None]:
#| export
def get_varnames_lut():
    "Build a `nuclide_id` -> `nc_name` dict from the `dbo_nuclide.xlsx` lookup file."
    wanted_cols = ['nuclide_id', 'nc_name']
    nuclides = pd.read_excel(lut_path() / 'dbo_nuclide.xlsx', usecols=wanted_cols)
    by_id = nuclides.set_index('nuclide_id')
    return by_id.to_dict()['nc_name']

In [None]:
# | export
class RemapRdnNameCB(Callback):
    "Replace `nuclide_id` values with the names given by a lookup table, in every dataframe of the transformer."
    def __init__(self,
                 fn_lut=get_varnames_lut # Callable returning the replacement mapping
                 ):
        fc.store_attr()

    def __call__(self, tfm):
        # Build the mapping once, then apply it to each sample-type dataframe.
        mapping = self.fn_lut()
        for grp in tfm.dfs.values():
            grp['nuclide_id'] = grp['nuclide_id'].replace(mapping)

In [None]:
dfs = load_data(df, 52)
tfm = Transformer(dfs, cbs=[RemapRdnNameCB()])

print(tfm()['biota']['nuclide_id'].unique())

['i131' 'cs134' 'cs137' 'k40' 'am241' 'pu239_240_tot' 'pu238']


### Rename columns

In [None]:
# Cols of interest
# To be added: endperiod, totdepth
# enums to add: area, lab, biogroup, sampmet
coi = ['latitude', 'longitude', 'begperiod', 
       'sampdepth', 'nuclide_id', 'activity', 'uncertaint', 
       'unit_id', 'detection', 'area_id', 'species_id', 'biogroup_id',
       'bodypar_id', 'sedtype_id', 'volume', 'salinity', 'temperatur',
       'filtered', 'sampmet_id', 'prepmet_id', 'counmet_id']

In [None]:
#| export
def renaming_rules():
    "Map dump column names to the variable names declared under `vars` in the CDL configuration."
    cfg_vars = cdl_cfg()['vars']
    defaults = cfg_vars['defaults']
    suffixes = cfg_vars['suffixes']
    bio = cfg_vars['bio']
    sed = cfg_vars['sed']
    # Source column -> target name; `activity` is the only fixed (non-configured) target.
    rules = {}
    rules['latitude'] = defaults['lat']['name']
    rules['longitude'] = defaults['lon']['name']
    rules['begperiod'] = defaults['time']['name']
    rules['sampdepth'] = defaults['depth']['name']
    rules['uncertaint'] = suffixes['uncertainty']['name']
    rules['unit_id'] = suffixes['unit']['name']
    rules['detection'] = suffixes['detection_limit']['name']
    rules['area_id'] = defaults['area']['name']
    rules['biogroup_id'] = bio['biogroup']['name']
    rules['bodypar_id'] = bio['body_part']['name']
    rules['sedtype_id'] = sed['sediment_type']['name']
    rules['volume'] = suffixes['volume']['name']
    rules['salinity'] = suffixes['salinity']['name']
    rules['temperatur'] = suffixes['temperature']['name']
    rules['sampmet_id'] = suffixes['sampling_method']['name']
    rules['prepmet_id'] = suffixes['preparation_method']['name']
    rules['counmet_id'] = suffixes['counting_method']['name']
    rules['activity'] = 'value'
    return rules

In [None]:
#| export
# Define columns of interest by sample type
# Keys are sample-type group names; values are the column names to keep for that group.
# NOTE(review): these column names ('NUCLIDE', 'VALUE_Bq/m³', ...) do not appear in the
# dump columns displayed elsewhere in this notebook; presumably carried over from
# another handler -- confirm whether this cell is still needed here.
coi_grp = {'seawater': ['NUCLIDE', 'VALUE_Bq/m³', 'ERROR%_m³', 'time',
                        'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)'],
           'sediment': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%_kg', 'time',
                        'TDEPTH', 'LATITUDE (dddddd)', 'LONGITUDE (dddddd)',
                        'sed_type'],
           'biota': ['NUCLIDE', 'VALUE_Bq/kg', 'ERROR%', 'time',
                     'SDEPTH', 'LATITUDE ddmmmm', 'LONGITUDE ddmmmm',
                     'species_id', 'body_part']}


In [None]:
#| export
# Define column names renaming rules
# NOTE(review): this assignment rebinds `renaming_rules`, which an earlier exported
# cell of this notebook defines as a function; after this cell runs the name refers
# to this dict and the function is no longer reachable under that name. Both cells
# are marked for export -- confirm which definition is intended.
# Several source columns map to the same target name (e.g. both VALUE_* -> 'value').
renaming_rules = {
    'NUCLIDE': 'nuclide',
    'VALUE_Bq/m³': 'value',
    'VALUE_Bq/kg': 'value',
    'ERROR%_m³': 'unc',
    'ERROR%_kg': 'unc',
    'ERROR%': 'unc',
    'TDEPTH': 'depth',
    'SDEPTH': 'depth',
    'LATITUDE (dddddd)': 'lat',
    'LATITUDE ddmmmm': 'lat',
    'LONGITUDE (dddddd)': 'lon',
    'LONGITUDE ddmmmm': 'lon'
}

In [None]:
df.groupby(['ref_id', 'samptype']).size().reset_index(name="Count").sort_values(by=['ref_id', 'Count'], ascending=False)

Unnamed: 0,ref_id,samptype,Count
602,717,Seawater,1058
601,716,Biota,3
600,712,Sediment,36
599,711,Sediment,370
598,709,Biota,13
...,...,...,...
5,9,Suspended matter,12
3,8,Sediment,50
2,6,Seawater,17
1,3,Seawater,18


In [None]:
df.groupby(['ref_id', 'samptype']).size()\
    .reset_index(name="Count").groupby('ref_id').size()\
    .reset_index(name="Count")\
    .sort_values(by='Count', ascending=False)

Unnamed: 0,ref_id,Count
24,52,4
444,695,4
363,497,4
423,567,4
125,205,3
...,...,...
168,254,1
167,253,1
166,252,1
165,251,1


In [None]:
## Testing on MacGarry et al., 1994

In [None]:
# def load_data(df:pd.DataFrame, # MARIS global dump 
#                  ref_id:int, # Reference id of interest
#                  ):
#     lut = {
#         'Sediment': 'sediment',
#         'Seawater': 'seawater',
#         'Suspended matter': 'suspended-matter',
#         'Biota': 'biota'}
#     dfs = {}
#     for name, grp in df[df.ref_id  == 695].groupby('samptype'):
#         dfs[lut[name]] = grp
#     return dfs

# dfs = load_data(df, 52)

['i131' 'cs134' 'cs137' 'k40' 'am241' 'pu239_240_tot' 'pu238']


In [None]:
# Cols of interest
coi = ['latitude', 'longitude', 'begperiod', 'endperiod', 
       'sampdepth', 'totdepth', 'nuclide_id', 'activity', 'uncertaint', 
       'unit_id', 'detection', 'area_id', 'zoterourl', 'lab_id', 'species_id', 'biogroup_id',
       'bodypar_id', 'sedtype_id', 'volume', 'salinity', 'temperatur',
       'filtered', 'sampmet_id', 'prepmet_id', 'counmet_id']

In [None]:
df_biota = dfs['biota'][coi]; df_biota

Unnamed: 0,latitude,longitude,begperiod,endperiod,sampdepth,totdepth,nuclide_id,activity,uncertaint,unit_id,...,biogroup_id,bodypar_id,sedtype_id,volume,salinity,temperatur,filtered,sampmet_id,prepmet_id,counmet_id
533970,53.367778,-6.144167,00:00.0,,,,29,53.00,,5,...,11,0,0,,,,,0,0,0
533971,53.367778,-6.144167,00:00.0,,,,31,0.20,,5,...,11,0,0,,,,,0,0,0
533972,53.367778,-6.144167,00:00.0,,,,33,2.80,,5,...,11,0,0,,,,,0,0,0
533973,53.367778,-6.144167,00:00.0,,,,4,271.00,,5,...,11,0,0,,,,,0,0,0
533974,53.367778,-6.144167,00:00.0,,,,29,63.00,,5,...,11,0,0,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810691,54.100833,-6.200278,00:00.0,,,,31,0.10,,5,...,14,0,0,,,,,0,0,0
810692,54.100833,-6.200278,00:00.0,,,,33,1.00,,5,...,14,0,0,,,,,0,0,0
810693,54.100833,-6.200278,00:00.0,,,,67,0.02,,5,...,14,0,0,,,,,0,0,0
810694,54.100833,-6.200278,00:00.0,,,,72,0.05,,5,...,14,0,0,,,,,0,0,0


In [None]:
vals = ['activity', 'uncertaint']

In [None]:
# class ReshapeLongToWide(Callback):
#     def __init__(self): fc.store_attr()

#     def __call__(self, tfm):
#         for k in tfm.dfs.keys():
#             cols = ['nuclide']
#             vals = ['value', 'unc']
#             idx = list(set(tfm.dfs[k].columns) -
#                        set(cols + vals))  # All others

#             tfm.dfs[k] = tfm.dfs[k].pivot_table(index=idx,
#                                                 columns=cols,
#                                                 values=vals).reset_index()

#             # Flatten cols name
#             tfm.dfs[k].columns = rename_cols(tfm.dfs[k].columns)

#             # Set index
#             tfm.dfs[k].index.name = 'sample'

In [None]:
# create new enums type

{0: nan,
 1: 'h3',
 2: 'be7',
 3: 'c14',
 4: 'k40',
 5: 'cr51',
 6: 'mn54',
 7: 'co57',
 8: 'co58',
 9: 'co60',
 10: 'zn65',
 11: 'sr89',
 12: 'sr90',
 13: 'zr95',
 14: 'nb95',
 15: 'tc99',
 16: 'ru103',
 17: 'ru106',
 18: 'rh106',
 19: 'ag106m',
 20: 'ag108',
 21: 'ag108m',
 22: 'ag110m',
 23: 'sb124',
 24: 'sb125',
 25: 'te129m',
 28: 'i129',
 29: 'i131',
 30: 'cs127',
 31: 'cs134',
 33: 'cs137',
 34: 'ba140',
 35: 'la140',
 36: 'ce141',
 37: 'ce144',
 38: 'pm147',
 39: 'eu154',
 40: 'eu155',
 41: 'pb210',
 42: 'pb212',
 43: 'pb214',
 44: 'bi207',
 45: 'bi211',
 46: 'bi214',
 47: 'po210',
 48: 'rn220',
 49: 'rn222',
 50: 'ra223',
 51: 'ra224',
 52: 'ra225',
 53: 'ra226',
 54: 'ra228',
 55: 'ac228',
 56: 'th227',
 57: 'th228',
 59: 'th232',
 60: 'th234',
 61: 'pa234',
 62: 'u234',
 63: 'u235',
 64: 'u238',
 65: 'np237',
 66: 'np239',
 67: 'pu238',
 68: 'pu239',
 69: 'pu240',
 70: 'pu241',
 71: 'am240',
 72: 'am241',
 73: 'cm242',
 74: 'cm243',
 75: 'cm244',
 76: 'cs134_137_tot',
 77: 

In [None]:
# select coi
# remove cols where only NaN
# rename cols where necessary
# rename nuclide_id using lut
# long to wide from nuclide_id (including all dependent vars like unit_id, ...)
# adapt detection to sanitized enums
# sanitize lat, lon
# convert time

In [None]:
test_str = 'https://www.zotero.org/groups/2432820/maris/items/3W354SQG'

In [None]:
test_str.split('/')[-1]

'3W354SQG'

In [None]:
df[df.ref_id  == 695].begperiod.unique()

array(['00:00.0'], dtype=object)

In [None]:
# by ref id
for name, grp in df.groupby(['ref_id']):
    print(name, grp[['ref_id', 'displaytext']])

(2,)         ref_id                 displaytext
714047       2  Crusius and Anderson, 1991
714048       2  Crusius and Anderson, 1991
714049       2  Crusius and Anderson, 1991
714050       2  Crusius and Anderson, 1991
714051       2  Crusius and Anderson, 1991
714052       2  Crusius and Anderson, 1991
714053       2  Crusius and Anderson, 1991
714054       2  Crusius and Anderson, 1991
714055       2  Crusius and Anderson, 1991
714056       2  Crusius and Anderson, 1991
714057       2  Crusius and Anderson, 1991
714058       2  Crusius and Anderson, 1991
714059       2  Crusius and Anderson, 1991
714060       2  Crusius and Anderson, 1991
714061       2  Crusius and Anderson, 1991
714062       2  Crusius and Anderson, 1991
714063       2  Crusius and Anderson, 1991
714064       2  Crusius and Anderson, 1991
714065       2  Crusius and Anderson, 1991
714066       2  Crusius and Anderson, 1991
714067       2  Crusius and Anderson, 1991
714068       2  Crusius and Anderson, 1991
714069

In [None]:
df.head()

Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,shiftedcoordinates,shiftedlong,shiftedlat,id
0,594163,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1304,SOURCE FILE NAME: 316N145_7_00398_00001_hy1.nc.,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"89.373,-31.597",0xE6100000010CF085C954C1983FC0A4703D0AD7575640,89.3725,-31.596667,1
1,594135,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1303,SOURCE FILE NAME: 33RR20090320_00181_00002_hy1...,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"109.551,-31.595",0xE6100000010C63EE5A423E983FC02506819543635B40,109.551389,-31.594722,2
2,594143,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1303,SOURCE FILE NAME: 33RR20090320_00181_00002_hy1...,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"109.551,-31.595",0xE6100000010C63EE5A423E983FC02506819543635B40,109.551389,-31.594722,3
3,594160,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1304,SOURCE FILE NAME: 316N145_7_00398_00001_hy1.nc.,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"89.373,-31.597",0xE6100000010CF085C954C1983FC0A4703D0AD7575640,89.3725,-31.596667,4
4,594152,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,402.1303,SOURCE FILE NAME: 33RR20090320_00181_00002_hy1...,"CCHDO, 2018. CCHDO (CLIVAR and Carbon Hydrogra...",1984,1,"109.551,-31.595",0xE6100000010C63EE5A423E983FC02506819543635B40,109.551389,-31.594722,5


In [None]:
len(df.ref_id.unique())

460

In [None]:
len(df.displaytext.unique())

421

In [None]:
len(df.ref_fulltext.unique())

458

In [None]:
df[df.ref_fulltext.str.contains('Wada, T.')][['ref_id', 'zoterourl']].drop_duplicates()

Unnamed: 0,ref_id,zoterourl
61510,130,https://www.zotero.org/groups/2432820/maris/items/2CQCUDR6
65792,129,https://www.zotero.org/groups/2432820/maris/items/4YNBL5M7


In [None]:
df[df.ref_fulltext == 'CNESTEN, 2020. Unpublished sediment data from CNESTEN, Morocco.'][['ref_id', 'zoterourl']].drop_duplicates()

Unnamed: 0,ref_id,zoterourl
444509,371,https://www.zotero.org/groups/2432820/maris/items/TPPY9XJQ
449061,372,https://www.zotero.org/groups/2432820/maris/items/7Q6V7RI7


In [None]:
df[['ref_id', 'ref_fulltext']].drop_duplicates().groupby('ref_fulltext').size().reset_index(name="Count").sort_values(by='Count', ascending=False)

Unnamed: 0,ref_fulltext,Count
72,"CNESTEN, 2020. Unpublished sediment data from CNESTEN, Morocco.",2
431,"Wada, T., Fujita, T., Nemoto, Y., Shimamura, S., Mizuno, T., Sohtome, T., Kamiyama, K., Narita, K., Watanabe, M., Hatta, N., Ogata, Y., Morita, T., Igarashi, S., 2016. Effects of the nuclear disaster on marine products in Fukushima: An update after fi...",2
301,"NRA - Nuclear Regulation Authority, 2021. Readings of Sea Area Monitoring - Monitoring of sea water - Outer sea area - Readings of Sea Area Monitoring at the Outer Sea of Miyagi, Fukushima, Ibaraki and Chiba Pref. [NRA].",1
312,"Noureddine, A., Benkrid, M., Maoui, R., Menacer, M., Boudjenoun, R., 2007. Distribution of natural radioactivity, 137Cs, 90Sr, and plutonium isotopes in a water column and sediment core along the Algerian coast. Science and Technology of Nuclear Insta...",1
311,"Noureddine, A., Benkrid, M., Hammadi, A., Boudjenoun, R., Menacer, M., Khaber, A., Kecir, M.S., 2003. Radioactivity distribution in surface and core sediment of the central part of the Algerian coast: an estimation of the recent sedimentation rate. Me...",1
310,"Norwegian Radiation Protection Authority, 2010. RADNOR - Radioactive dose assessment improvements for the Nordic marine environment: Transport and environmental impact of technetium 99 (99Tc) in marine ecosystems.",1
309,"Nonova, T., 2016. 90Sr, 210Pb, 210Po and Ra isotopes in marine macroalgae and mussel Mytilus galloprovincialis from the Bulgarian Black Sea zone. J Radioanal Nucl Chem 12.",1
308,"Nonova, T., 2014. Cesium and strontium in Black Sea macroalgae. Journal of Environmental Radioactivity 9.",1
307,"Nies, H., 1989. Plutonium and 137Cs in the water columns of the Northeast Atlantic, in: Interim Oceanographic Description of the North-East Atlantic Site for the Disposal of Low-Level Radioactive Waste. Nuclear Energy Agency of the OECD, pp. 77û81.",1
306,"Nakamura, T., Kimura, O., Matsuda, A., Matsuishi, T., Kobayashi, M., Endo, T., 2015. Radiocesium contamination of cetaceans stranded along the coast of Hokkaido, Japan, and an estimation of their travel routes. Marine Ecology Progress Series 535, 1û9....",1


In [None]:
df_aoyama = df[df.displaytext == 'Aoyama et al., 2013']

df_aoyama[df_aoyama.ref_id == 233].head(2)

Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,shiftedcoordinates,shiftedlong,shiftedlat,id
24601,496691,1908,North Pacific Ocean,1,Seawater,233,"Aoyama et al., 2013",https://www.zotero.org/groups/2432820/maris/items/VT6S3YA5,,,...,,,"Aoyama, M., Tsumune, D., Hamajima, Y., 2013. Distribution of 137Cs and 134Cs in the North Pacific Ocean: impacts of the TEPCO Fukushima-Daiichi NPP accident. J Radioanal Nucl Chem 296, 535û539. https://doi.org/10.1007/s10967-012-2033-2",2013,1,"170.458,41.318",0xE6100000010CC66D3480B7A844402DB29DEFA74E6540,170.458056,41.318056,24602
24602,496697,1908,North Pacific Ocean,1,Seawater,233,"Aoyama et al., 2013",https://www.zotero.org/groups/2432820/maris/items/VT6S3YA5,,,...,,,"Aoyama, M., Tsumune, D., Hamajima, Y., 2013. Distribution of 137Cs and 134Cs in the North Pacific Ocean: impacts of the TEPCO Fukushima-Daiichi NPP accident. J Radioanal Nucl Chem 296, 535û539. https://doi.org/10.1007/s10967-012-2033-2",2013,1,"-151.958,43.008",0xE6100000010C7E8CB96B098145402DB29DEFA7FE62C0,-151.958056,43.008056,24603


In [None]:
df_aoyama[df_aoyama.ref_id == 234].head(2)

Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,profile_id,sampnote,ref_fulltext,ref_yearpub,ref_sampleTypes,LongLat,shiftedcoordinates,shiftedlong,shiftedlat,id
24627,496723,1908,North Pacific Ocean,1,Seawater,234,"Aoyama et al., 2013",https://www.zotero.org/groups/2432820/maris/items/2J83TK5F,,,...,,9042.7 km from FDNPS. Included in JAEA website (https://emdb.jaea.go.jp/emdb/en/portals/1060103000/).,"Aoyama, M., Uematsu, M., Tsumune, D., Hamajima, Y., 2013. Surface pathway of radioactive plume of TEPCO Fukushima NPP1 released 134Cs and 137Cs. Biogeosciences 10, 3067û3078. https://doi.org/10.5194/bg-10-3067-2013",2013,1,"-120.36,25.42",0xE6100000010CEC51B81E856B3940D7A3703D0A175EC0,-120.36,25.42,24628
24628,496723,1908,North Pacific Ocean,1,Seawater,234,"Aoyama et al., 2013",https://www.zotero.org/groups/2432820/maris/items/2J83TK5F,,,...,,9042.7 km from FDNPS. Included in JAEA website (https://emdb.jaea.go.jp/emdb/en/portals/1060103000/).,"Aoyama, M., Uematsu, M., Tsumune, D., Hamajima, Y., 2013. Surface pathway of radioactive plume of TEPCO Fukushima NPP1 released 134Cs and 137Cs. Biogeosciences 10, 3067û3078. https://doi.org/10.5194/bg-10-3067-2013",2013,1,"-120.36,25.42",0xE6100000010CEC51B81E856B3940D7A3703D0A175EC0,-120.36,25.42,24629


In [None]:
df.groupby(by=['displaytext', 'ref_id']).size().reset_index(name="Count")

Unnamed: 0,displaytext,ref_id,Count
0,"ASPAMARD, 2004",97,1128
1,"Aarkrog et al., 1989",44,141
2,"Aarkrog et al., 1992",31,137
3,"Aarkrog et al., 1994",41,196
4,"Ababneh et al., 2018",500,20
5,"Abd Rahim Mohamed and Feong Kuan, 2005",437,16
6,"Abdullah et al., 2015",502,30
7,"Ademola and Ehiedu, 2010",343,75
8,"Ak÷zcan and U?ur, 2013",284,24
9,"Ak÷zcan, 2013",530,23


In [None]:
pd.set_option('display.max_rows', 700)
df.groupby(by=['displaytext', 'samptype_id', 'ref_id']).size().reset_index(name="Count")

Unnamed: 0,displaytext,samptype_id,ref_id,Count
0,"ASPAMARD, 2004",1,97,799
1,"ASPAMARD, 2004",3,97,329
2,"Aarkrog et al., 1989",1,44,141
3,"Aarkrog et al., 1992",1,31,137
4,"Aarkrog et al., 1994",1,41,196
5,"Ababneh et al., 2018",2,500,20
6,"Abd Rahim Mohamed and Feong Kuan, 2005",2,437,16
7,"Abdullah et al., 2015",2,502,30
8,"Ademola and Ehiedu, 2010",2,343,45
9,"Ademola and Ehiedu, 2010",3,343,30


In [None]:
len(df.ref_fulltext.unique())

458

In [None]:
len(df.ref_id.unique())

460

In [None]:
df.area_id.unique()

array([1904, 1906, 1907, 1908, 1910, 1912, 1914, 2350, 2351, 2353, 2356,
       2357, 2359, 2374, 2379, 2389, 2401, 2402, 2407, 2409, 3141, 3314,
       3315, 3319, 3322, 3324, 3346, 3351, 3363, 3369, 3386, 4245, 4246,
       4247, 4248, 4250, 4252, 4253, 4256, 4257, 4261, 4262, 4263, 4264,
       4265, 4266, 4267, 4268, 4269, 4273, 4274, 4275, 4276, 4279, 4280,
       4283, 4286, 4287, 4288, 4290, 4291, 4300, 4302, 4303, 4306, 4307,
       4309, 4310, 4312, 4313, 4314, 4332, 4334, 4336, 4338, 4339, 4341,
       4344, 4347, 4358, 4359, 4360, 4361, 4364, 4365, 4366, 5698, 9999])

In [None]:
df.areaname.unique()

array(['Indian Ocean', 'Arctic Ocean', 'Southern Ocean',
       'North Pacific Ocean', 'South Pacific Ocean',
       'North Atlantic Ocean', 'South Atlantic Ocean', 'North Sea',
       'Celtic Sea', 'Norwegian Sea', 'Greenland Sea',
       "Irish Sea and St. George's Channel", 'Bay of Biscay', 'Kattegat',
       'Skagerrak', 'English Channel', 'Baltic Sea', 'Gulf of Bothnia',
       'Gulf of Finland', 'Gulf of Riga', 'Bristol Channel',
       'Adriatic Sea', 'Aegean Sea', 'Black Sea',
       'Balearic (Iberian Sea)', 'Alboran Sea', 'Strait of Gibraltar',
       'Ionian Sea', 'Ligurian Sea', 'Sea of Marmara', 'Tyrrhenian Sea',
       'Laptev Sea', 'Kara Sea', 'Barentsz Sea', 'White Sea',
       'Davis Strait', 'Hudson Bay', 'Baffin Bay', 'Beaufort Sea',
       'Chukchi Sea', 'Mozambique Channel', 'Gulf of Suez',
       'Gulf of Aqaba', 'Red Sea', 'Gulf of Aden', 'Persian Gulf',
       'Gulf of Oman', 'Arabian Sea', 'Laccadive Sea', 'Bay of Bengal',
       'Andaman or Burma Sea', 'Malacc

In [None]:
# first fetch json: https://maris-uat.iaea.org/swagger/index.html?url=/api/specification.json#/AreaGroupLookup/AreaGroupLookup_GetAll
# NOTE(review): `areas` is not defined in the visible part of this notebook; presumably
# it is the parsed JSON list of area records (dicts with an 'areaId' string) fetched
# from the endpoint above -- confirm, and load it in a cell so a fresh kernel can run this.
# Keep only records whose 'areaId' contains neither "." nor "-", so the cast to int below succeeds.
df_area = pd.DataFrame([area for area in areas if (("." not in area['areaId']) and ("-" not in area['areaId']))])
df_area['areaId'] = df_area['areaId'].astype('int')
# Write the lookup table, sorted by area id, to the Excel file below.
df_area.sort_values(by='areaId').to_excel('../files/lut/dbo_area.xlsx', index=False)

In [None]:
# 402-cchdo-2018.nc

In [None]:
pd.set_option('display.max_colwidth', 255)
df.zoterourl[:2]

0    https://www.zotero.org/groups/2432820/maris/items/GSALIT9M
1    https://www.zotero.org/groups/2432820/maris/items/GSALIT9M
Name: zoterourl, dtype: object

In [None]:
df.ref_id[:2]

0    402
1    402
Name: ref_id, dtype: int64

In [None]:
df.taxonname.unique()

array(['(Not available)', 'Penaeus indicus', 'Acanthephyra quadrispinosa',
       ..., 'Rajidae', 'Scophthalmus rhombus', 'Capros aper'],
      dtype=object)

In [None]:
df.ref_note[:2]

0    Data downloaded from: Oms (2018), Tritium in ocean
1    Data downloaded from: Oms (2018), Tritium in ocean
Name: ref_note, dtype: object

In [None]:
df.columns

Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'biogroup_id', 'taxondb',
       'taxondbid', 'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar',
       'sliceup', 'slicedown', 'sedtype_id', 'sedtype', 'sedrepname',
       'nuclide_id', 'nusymbol', 'volume', 'salinity', 'temperatur',
       'filtered', 'filtpore', 'samparea', 'drywt', 'wetwt', 'percentwt',
       'sampmet_id', 'sampmet', 'prepmet_id', 'prepmet', 'drymet_id', 'drymet',
       'counmet_id', 'counmet', 'decayedto', 'detection', 'activity',
       'uncertaint', 'unit_id', 'unit', 'vartype', 'freq', 'rangelow',
       'rangeupp', 'profile', 'transect_id', 'measure_note', 'shapetype_id',
       'profile_id', 'sampnote', 'ref_full

In [None]:
# samptype: group (sediment, ...)
# 

In [None]:
df['samplingyear']

0         1995
1         2009
2         2009
3         1995
4         2009
          ... 
818982    2014
818983    2014
818984    2014
818985    2014
818986    2014
Name: samplingyear, Length: 818987, dtype: int64

In [None]:
df.columns

Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'biogroup_id', 'taxondb',
       'taxondbid', 'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar',
       'sliceup', 'slicedown', 'sedtype_id', 'sedtype', 'sedrepname',
       'nuclide_id', 'nusymbol', 'volume', 'salinity', 'temperatur',
       'filtered', 'filtpore', 'samparea', 'drywt', 'wetwt', 'percentwt',
       'sampmet_id', 'sampmet', 'prepmet_id', 'prepmet', 'drymet_id', 'drymet',
       'counmet_id', 'counmet', 'decayedto', 'detection', 'activity',
       'uncertaint', 'unit_id', 'unit', 'vartype', 'freq', 'rangelow',
       'rangeupp', 'profile', 'transect_id', 'measure_note', 'shapetype_id',
       'profile_id', 'sampnote', 'ref_full

In [None]:
df['filtered'].unique()

array([nan, 'Y', 'N'], dtype=object)

In [None]:
df['detection'] == '='

0         True
1         True
2         True
3         True
4         True
          ... 
818982    True
818983    True
818984    True
818985    True
818986    True
Name: detection, Length: 818987, dtype: bool

In [None]:
df_sw = df[df['samptype'] == 'Seawater']
print(df_sw.shape)
df_sw = df_sw[df_sw['detection'] == '=']
print(df_sw.shape)
df_sw = df_sw[['areaname', 'latitude', 'longitude', 'nusymbol', 'samplingyear', 'activity', 'unit', 'sampdepth']]


(414051, 80)
(295107, 80)


In [None]:
df_sw.head()

Unnamed: 0,areaname,latitude,longitude,nusymbol,samplingyear,activity,unit,sampdepth
0,Indian Ocean,-31.596667,89.3725,3H,1995,0.4352,TU,608.4
1,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,1363.3
2,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,3199.5
3,Indian Ocean,-31.596667,89.3725,3H,1995,0.7237,TU,287.7
4,Indian Ocean,-31.594722,109.551389,3H,2009,0.476,TU,92.9


In [None]:
# depth = -1 reallocated to 0
# df_sw['sampdepth'][df_sw['sampdepth'] == -1] = 0

In [None]:
nuclide_of_interest = ['137Cs', '90Sr', '3H', '239,240Pu']
df_sw = df_sw[df_sw['nusymbol'].isin(nuclide_of_interest)]

In [None]:
df_sw.head()

Unnamed: 0,areaname,latitude,longitude,nusymbol,samplingyear,activity,unit,sampdepth
0,Indian Ocean,-31.596667,89.3725,3H,1995,0.4352,TU,608.4
1,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,1363.3
2,Indian Ocean,-31.594722,109.551389,3H,2009,0.001,TU,3199.5
3,Indian Ocean,-31.596667,89.3725,3H,1995,0.7237,TU,287.7
4,Indian Ocean,-31.594722,109.551389,3H,2009,0.476,TU,92.9


In [None]:
df['unit'].unique()

array(['TU', 'DELTA/mill', 'Bq/m3', 'Bq/kgd', 'Bq/kgw', 'atom/kg',
       'kg/kg', 'NOT AVAILABLE', 'atom/l', 'Bq/kg', 'Bq/m2', 'atom/kgd'],
      dtype=object)

In [None]:
df_sw.groupby(['nusymbol', 'unit']).size()

nusymbol   unit 
137Cs      Bq/m3    94504
239,240Pu  Bq/m3     7331
3H         Bq/m3    86194
90Sr       Bq/m3    16651
dtype: int64

In [None]:
# Convert activities reported in TU to Bq/m3 (factor of 119), then relabel the unit
is_TU = df_sw['unit'].eq('TU')
df_sw.loc[is_TU, 'activity'] = df_sw.loc[is_TU, 'activity'] * 119
df_sw.loc[is_TU, 'unit'] = "Bq/m3"

In [None]:
# Conversion Bq/kg -> Bq/m3
# Bq/kg * (kg/m3) = Bq/m3, so the activity is MULTIPLIED by the seawater density
# (taken as 1025 kg/m3); the previous version divided by it.
is_kg = df_sw['unit'] == 'Bq/kg'
df_sw.loc[is_kg, 'activity'] = df_sw.loc[is_kg, 'activity'] * 1025
df_sw.loc[is_kg, 'unit'] = "Bq/m3"