# Post-processing MARIS seawater dump

In [None]:
import pandas as pd
from fastcore.xtras import L
from pathlib import Path

In [None]:
# Location of the raw MARIS Excel export, relative to the user's home directory.
fname = 'pro/data/maris/2023-03-28 MARIS all seawater data.xlsx'
raw_path = Path.home().joinpath(fname)

# Load the workbook into a DataFrame.
df = pd.read_excel(raw_path)

In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['sample_id', 'area_id', 'areaname', 'samptype_id', 'samptype', 'ref_id',
       'displaytext', 'zoterourl', 'ref_note', 'datbase', 'lab_id', 'lab',
       'latitude', 'longitude', 'begperiod', 'endperiod', 'samplingyear',
       'totdepth', 'sampdepth', 'station', 'samplabcode', 'species_id',
       'taxonname', 'taxonrank', 'biogroup', 'taxondb', 'taxondbid',
       'taxondburl', 'taxonrepname', 'bodypar_id', 'bodypar', 'sliceup',
       'slicedown', 'sedtype_id', 'sedtype', 'sedrepname', 'nuclide_id',
       'nusymbol', 'volume', 'salinity', 'temperatur', 'filtered', 'filtpore',
       'samparea', 'drywt', 'wetwt', 'percentwt', 'sampmet_id', 'sampmet',
       'prepmet_id', 'prepmet', 'drymet_id', 'drymet', 'counmet_id', 'counmet',
       'decayedto', 'detection', 'activity', 'uncertaint', 'unit_id', 'unit',
       'vartype', 'freq', 'rangelow', 'rangeupp', 'profile', 'transect_id',
       'measure_note', 'shapetype_id', 'profile_id', 'sampnote',
       'ref_fulltext', 'ref_yea

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,sample_id,area_id,areaname,samptype_id,samptype,ref_id,displaytext,zoterourl,ref_note,datbase,...,ref_yearpub,ref_sampleTypes,LongLat,displaycoordinates,DisplayLong,DisplayLat,id,activity_corr,uncertaint_corr,unit_corr
0,602942,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,1984,1,"45.958,-5.167",0xE6100000010CB4E55C8AABAA14C0DFE00B93A9FA4640,45.958333,-5.166667,1,34.81,0.354,Bq/m3
1,589715,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,1984,1,"110.59,-24.379",0xE6100000010C371AC05B206138C0F6285C8FC2A55B40,110.589722,-24.379444,2,13.334,0.236,Bq/m3
2,594043,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,1984,1,"104.167,-31.516",0xE6100000010C87A757CA32843FC0A69BC420B00A5A40,104.166667,-31.516389,3,5.0858,0.2714,Bq/m3
3,603191,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,1984,1,"115,-52",0xE6100000010C0000000000004AC00000000000C05C40,115.0,-52.0,4,4.248,0.472,Bq/m3
4,596545,1904,Indian Ocean,1,Seawater,402,"CCHDO, 2018",https://www.zotero.org/groups/2432820/maris/it...,"Data downloaded from: Oms (2018), Tritium in o...",,...,1984,1,"93.614,-33.418",0xE6100000010C68226C787AB540C0ECC039234A675740,93.613889,-33.417778,5,101.716,0.4248,Bq/m3


In [None]:
# Single source of truth: source column label -> output column label.
# Insertion order is the column order of the exported table.
col_renames = {
    'sample_id': 'smp_id',
    'DisplayLat': 'lat',
    'DisplayLong': 'lon',
    'sampdepth': 'depth',
    'profile_id': 'profile_id',
    'begperiod': 'begperiod',
    'nusymbol': 'nusymbol',
    'unit_corr': 'unit',
    'activity_corr': 'activity',
    'uncertaint_corr': 'uncertainty',
    'ref_id': 'ref_id',
}

# Columns to keep from the raw frame, and the labels they receive (same order).
cols_of_interest = list(col_renames.keys())
cols_name = list(col_renames.values())

In [None]:
# Keep all rows, restricted to the columns listed in cols_of_interest (in that order).
df_selected = df.loc[:, cols_of_interest]

In [None]:
# Relabel the columns positionally with the names in cols_name.
df_selected = df_selected.set_axis(cols_name, axis=1)

In [None]:
# Preview the first rows of the selected, relabelled frame.
df_selected.head()

Unnamed: 0,smp_id,lat,lon,depth,profile_id,begperiod,nusymbol,unit,activity,uncertainty,ref_id
0,602942,-5.166667,45.958333,352.6,402.2178,1996-01-18 18:20:00,3H,Bq/m3,34.81,0.354,402
1,589715,-24.379444,110.589722,756.2,402.805,1995-11-13 02:52:00,3H,Bq/m3,13.334,0.236,402
2,594043,-31.516389,104.166667,1013.6,402.1293,1995-04-10 10:51:00,3H,Bq/m3,5.0858,0.2714,402
3,603191,-52.0,115.0,607.2,402.22,1995-01-08 02:54:00,3H,Bq/m3,4.248,0.472,402
4,596545,-33.417778,93.613889,138.9,402.1531,1995-04-06 00:00:00,3H,Bq/m3,101.716,0.4248,402


In [None]:
# Keep rows with a non-negative activity. A comparison against a missing value
# evaluates to False, so such rows would be dropped as well.
df_selected = df_selected.loc[df_selected['activity'].ge(0)]

In [None]:
# Keep rows with a non-negative depth. A comparison against a missing value
# evaluates to False, so such rows would be dropped as well.
df_selected = df_selected.loc[df_selected['depth'].ge(0)]

In [None]:
# Maximum activity among the rows whose nusymbol is '137Cs'.
is_cs137 = df_selected['nusymbol'].eq('137Cs')
df_selected.loc[is_cs137, 'activity'].max()

68000000.0

In [None]:
# Print a summary of df_selected: index range, per-column non-null counts, dtypes and memory usage.
df_selected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 381189 entries, 0 to 400736
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   smp_id       381189 non-null  int64         
 1   lat          381189 non-null  float64       
 2   lon          381189 non-null  float64       
 3   depth        381189 non-null  float64       
 4   profile_id   116070 non-null  float64       
 5   begperiod    381189 non-null  datetime64[ns]
 6   nusymbol     381189 non-null  object        
 7   unit         381189 non-null  object        
 8   activity     381189 non-null  float64       
 9   uncertainty  181842 non-null  float64       
 10  ref_id       381189 non-null  int64         
dtypes: datetime64[ns](1), float64(6), int64(2), object(2)
memory usage: 34.9+ MB


In [None]:
# Destination of the post-processed table, relative to the user's home directory.
out_fname = Path.home().joinpath('pro/data/maris/maris-seawater-2023-04-15.csv')

# Write the frame as CSV without the row index.
df_selected.to_csv(out_fname, index=False)