# IAEA
> Bundling mid-infrared spectroscopy (MIRS) measurements from the Joint FAO/IAEA Centre

In [None]:
#| default_exp data.external.kssl

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
#import subprocess
import pandas as pd
from fastcore.basics import patch
from fastcore.xtras import mkdir
from pathlib import Path
import glob
import re
from tqdm import tqdm
from typing import Dict, Callable

from spanda.readers import read_spa
from spanda.sig import interp

In [None]:
#| export
def load_wetchem(fname_target:str, # Excel file containing wet chemistry measurements
                 col_names:list[str]=[
                     'id', 'year', 'crops', 'kd_cs133', 
                     'ex_cs_total_cs_ratio', 'cs137', 'cs137_total', 
                     'ex_cs137', 'ex_k2o', 'ph', 'c',
                     'n', 'c_n_ratio', 'ex_mgo', 'ex_cao', 
                     'cec', 'pretreatment'], # Wet chemistry excel file column names
                 kwargs:dict={'skiprows': 3}) -> pd.DataFrame:  # Loaded Wet Chemistry
    """Read the wet chemistry spreadsheet and relabel its columns.

    `kwargs` is unpacked as keyword arguments of `pd.read_excel`. The columns of the
    frame read from `fname_target` are then renamed, in order, with `col_names`, so
    `col_names` must hold exactly one label per column read.
    """
    # Labels are applied positionally: the spreadsheet's own header text is discarded
    return pd.read_excel(fname_target, **kwargs).set_axis(col_names, axis=1)

In [None]:
# Load the Fukushima wet chemistry spreadsheet with the default column names, then display it
df = load_wetchem('../_data/seibersdorf/Fukushima_soil_221219.xlsx')
df

Unnamed: 0,id,year,crops,kd_cs133,ex_cs_total_cs_ratio,cs137,cs137_total,ex_cs137,ex_k2o,ph,c,n,c_n_ratio,ex_mgo,ex_cao,cec,pretreatment
0,20,2013,paddy_rice,140.948011,0.115672,5.117416,610.028484,70.563481,17.6,5.7,4.46,0.36,12.0,64.1,339.0,29.5,ground
1,28,2014,paddy_rice,69.323903,0.101520,1.163779,273.488002,27.764435,62.1,5.3,1.39,0.12,12.0,30.3,217.0,19.6,ground
2,33,2014,paddy_rice,34.888567,0.127219,0.498762,28.096944,3.574463,22.3,6.0,1.42,0.12,12.0,38.1,96.1,13.8,ground
3,35,2014,paddy_rice,26.047261,0.079537,2.244430,897.772163,71.406138,33.6,5.4,2.86,0.24,12.0,16.2,119.0,15.4,ground
4,36,2014,paddy_rice,23.944595,0.093966,1.662541,964.273805,90.608487,57.0,5.4,3.11,0.26,12.0,19.9,151.0,17.7,ground
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,295,2019,paddy_rice,26.167915,0.079775,1.110326,1660.823169,132.492635,27.7,5.2,2.14,0.39,5.5,25.0,97.0,17.0,
172,296,2019,paddy_rice,2.649075,0.010654,0.475854,998.359995,10.636733,35.5,6.4,2.64,0.44,6.0,69.0,300.0,21.0,excluded at first
173,297,2019,paddy_rice,1.337928,0.007500,0.237927,1082.334200,8.117506,32.2,6.4,2.20,0.38,5.8,66.0,260.0,17.0,excluded at first
174,298,2019,paddy_rice,0.963191,0.007679,0.475854,1567.518496,12.036303,33.5,5.7,1.84,0.34,5.4,37.0,180.0,16.0,excluded at first


In [None]:
#| export
def get_replicates(src_dir_spectra, # Directory (str or Path) containing the `.SPA` spectra files
                   pattern, # Regexp matched against the beginning of each file name
                  ): # Sorted list of `Path` of the matching replicates
    """List the `.SPA` files of `src_dir_spectra` whose file name matches `pattern`.

    `pattern` is applied with `re.match`, i.e. anchored at the start of the file name
    only: `'20[a-z]'` selects `20a.SPA` but not `120a.SPA`. Nothing anchors the end of
    the name, so add `$` or the extension to the pattern if a stricter match is needed.

    Paths are returned sorted so that the result does not depend on the order in which
    the file system lists the directory. An empty list is returned when nothing matches.
    """
    regex = re.compile(pattern)
    # Going through `Path` lets callers pass either a str or a Path as directory
    candidates = glob.glob(str(Path(src_dir_spectra) / '*.SPA'))
    return sorted(Path(f) for f in candidates if regex.match(Path(f).name))

In [None]:
# Replicate spectra of sample 20 are stored as 20a.SPA, 20b.SPA, ...
get_replicates(
    '../_data/seibersdorf/spectra',
    pattern='20[a-z]',
)

[Path('../_data/seibersdorf/spectra/20b.SPA'),
 Path('../_data/seibersdorf/spectra/20c.SPA'),
 Path('../_data/seibersdorf/spectra/20a.SPA'),
 Path('../_data/seibersdorf/spectra/20d.SPA')]

In [None]:
#| export
def bundle(df:pd.DataFrame, # Wet chemistry
           src_dir_spectra:str, # Directory containing infrared spectra
           fn_get_replicates:Callable, # Get list of path of replicates given a regexp pattern
           dest_dir:str, # Destination directory
           analytes:list[str]=['ph', 'cec'], # Analytes of interest
           wn_range:tuple=(4000, 650), # Max and min wavenumbers 
           compressed=True, # True if folder should be compressed
          ):
    """Bundle IAEA dataset, one folder per sample id, as follows:

        - /dest_dir/
        -          /smp_id/
        -                  target.csv
        -                  <replicate file stem>.csv

    `target.csv` stores the `id` and the requested `analytes` of the sample as
    `analyte,value` rows. Each replicate returned by `fn_get_replicates` for the
    pattern `<id>[a-z]` is read, interpolated over `wn_range` and saved as a
    `wavenumber,absorbance` table named after the replicate file stem.

    NOTE: `compressed` is accepted but is not read anywhere in the body.
    """
    dest_dir = Path(dest_dir)
    # NOTE(review): `overwrite=True` presumably replaces an existing `dest_dir` - confirm in fastcore docs
    mkdir(dest_dir, overwrite=True)

    n_samples = len(df)
    for _, sample in tqdm(df.iterrows(), total=n_samples):
        smp_id = str(sample['id'])
        smp_dir = dest_dir / smp_id

        # Only the first row carrying a given id creates the folder and its target.csv
        if not smp_dir.exists():
            smp_dir.mkdir()
            selected = sample[['id'] + analytes].rename('value').rename_axis('analyte')
            selected.to_csv(smp_dir / 'target.csv')

        # Replicates are looked up for every row, whether or not the folder already existed
        for replicate in fn_get_replicates(src_dir_spectra, pattern=smp_id + '[a-z]'):
            absorbance, wn = interp(read_spa(replicate), between=wn_range)
            spectrum = pd.Series(data=absorbance, index=wn, name='absorbance')
            spectrum.index.name = 'wavenumber'
            spectrum.to_csv(f'{smp_dir / replicate.stem}.csv')

In [None]:
#|eval: false
df = load_wetchem('../_data/seibersdorf/Fukushima_soil_221219.xlsx')

# Bundle every measured property, i.e. all columns but id, year, crops and pretreatment
analytes = [
    'kd_cs133', 'ex_cs_total_cs_ratio', 'cs137', 'cs137_total',
    'ex_cs137', 'ex_k2o', 'ph', 'c',
    'n', 'c_n_ratio', 'ex_mgo', 'ex_cao', 'cec',
]

bundle(
    df,
    src_dir_spectra='../_data/seibersdorf/spectra',
    fn_get_replicates=get_replicates,
    dest_dir='../_data/seibersdorf-mirs',
    analytes=analytes,
    wn_range=(4000, 600),
)

100%|██████████████████████████████████████████████████████████████| 176/176 [00:02<00:00, 61.29it/s]
