# WCRC
> Bundling the WCRC (Woodwell Climate Research Center) cross-instrument (MIRS) cross-trial dataset

In [None]:
#| default_exp data.external.wcrc

In [None]:
#| hide
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import pandas as pd
import numpy as np
import kennard_stone as ks
from sklearn.model_selection import train_test_split
from fastcore.basics import patch
from fastcore.xtras import mkdir
from pathlib import Path
import glob
import re
from tqdm import tqdm
from typing import Dict, Callable

from spanda.readers import read_spa

In [None]:
#|export
def load_wetchem(fname:str, # Path of the wet chemistry CSV file (needs a 'source' column)
                 source:list[str]=None, # Lab(s) that measured the samples, e.g. 'KSSL' and/or 'NAPT'; a single name is accepted too. Defaults to ['KSSL']
                ) -> pd.DataFrame:  # Wet chemistry rows measured by the selected lab(s)
    "Load wet chemistry (target) and keep only the rows whose 'source' is in `source`"
    if source is None:
        # `None` replaces the former mutable list default
        source = ['KSSL']
    elif isinstance(source, str):
        # `Series.isin` rejects a bare string, so wrap a single lab name in a list
        source = [source]
    df = pd.read_csv(fname)
    return df[df['source'].isin(source)]

In [None]:
#|eval: false
# Wet chemistry (target) table; preview the first rows
fname = '../_data/wcrc-ct/RT_wetchem_soildata.csv'
df_wet = load_wetchem(fname)
df_wet.head()

Unnamed: 0,sample_id,source,clay_perc,pH_H20,carbon_tot_perc,carbon_org_perc,potassium_cmolkg,phosporus_mgkg
0,RT_01,KSSL,6.28199,4.8525,0.6117,0.6117,0.26906,5.59999
1,RT_02,KSSL,4.44442,4.585,3.48602,3.48602,0.23349,19.28365
2,RT_03,KSSL,42.19991,4.23,0.25716,0.25716,0.29109,
3,RT_04,KSSL,17.10475,6.53,1.97755,1.97755,0.49925,
4,RT_05,KSSL,17.22985,6.47,2.14765,2.13584,0.59977,


In [None]:
#|eval: false
# Raw spectra table (one row per organization and sample); preview the first rows
fname = '../_data/wcrc-ct/RT_STD_allMIRspectra_raw.csv'
df_spec = pd.read_csv(fname)
df_spec.head()

Unnamed: 0,organization,sample_id,600,602,604,606,608,610,612,614,...,3982,3984,3986,3988,3990,3992,3994,3996,3998,4000
0,Agrocares,RT_01,2.14711,2.13471,2.12274,2.11369,2.10487,2.09844,2.09177,2.08036,...,1.09917,1.09898,1.09877,1.09851,1.09827,1.09811,1.09797,1.09797,1.09797,1.09797
1,Agrocares,RT_02,2.32404,2.33516,2.34558,2.35114,2.35641,2.35855,2.36069,2.36288,...,1.24065,1.24042,1.24015,1.23977,1.23938,1.23896,1.23861,1.23861,1.23861,1.23861
2,Agrocares,RT_03,2.50423,2.50755,2.50973,2.50403,2.49913,2.50261,2.50697,2.5291,...,1.25036,1.24959,1.24877,1.24777,1.24679,1.24585,1.24505,1.24505,1.24505,1.24505
3,Agrocares,RT_04,2.30902,2.29788,2.28667,2.2749,2.26393,2.26128,2.25905,2.26521,...,1.15192,1.15176,1.15158,1.15132,1.15106,1.1508,1.15058,1.15058,1.15058,1.15058
4,Agrocares,RT_05,2.28727,2.28852,2.29013,2.29434,2.29856,2.30286,2.307,2.30782,...,1.16109,1.16106,1.16102,1.16095,1.16086,1.1607,1.16056,1.16056,1.16056,1.16056


In [None]:
#| export
def _write_sample(path:Path, # Sample directory, created with fastcore `mkdir(..., overwrite=True)`
                  wns:pd.Index, # Wavenumbers, used as index of the spectrum
                  absorbance:np.ndarray, # Absorbance values aligned with `wns`
                  analyte:str, # Name of the analyte
                  value:float, # Measured value of the analyte
                 ):
    "Write `spectrum.csv` and `target.csv` of a single sample into `path`"
    mkdir(path, parents=True, overwrite=True)

    # Spectrum: one row per wavenumber, single 'absorbance' column
    spectrum = pd.Series(data=absorbance, index=wns, name='absorbance')
    spectrum.index.name = 'wavenumber'
    spectrum.to_csv(path / 'spectrum.csv')

    # Target: a single row holding the analyte name and its value
    target = pd.Series(value, index=[analyte], name='value')
    target.index.name = 'analyte'
    target.to_csv(path / 'target.csv')


def bundle(df_wet:pd.DataFrame, # Wet chemistry
           df_spec:pd.DataFrame, # Spectra
           dest_dir:str, # Destination directory
           analytes:list[str]=None, # Analytes of interest. Defaults to ['clay_perc', 'pH_H20', 'carbon_tot_perc', 'carbon_org_perc', 'potassium_cmolkg', 'phosporus_mgkg']
           ks_on_features:bool=True, # Currently ignored: the Kennard-Stone split is disabled, a seeded random split is always used
           test_size:float=0.2, # Share of the samples assigned to the test set
           random_state:int=42, # Seed of the random train/test split
          ):
    """Bundle WCRC datasets as follows:
        - /dest_dir/
        -          /organization
        -              /analyte
        -                  /train
        -                      /sample_id
        -                          target.csv
        -                          spectrum.csv
        -                  /test
        -                      /sample_id
        -                          target.csv
        -                          spectrum.csv

    For every organization and analyte, samples with a missing analyte value are
    dropped, the remaining ones are split with scikit-learn's `train_test_split`
    and each sample is written to its own folder.
    """
    if analytes is None:
        analytes = ['clay_perc', 'pH_H20', 'carbon_tot_perc',
                    'carbon_org_perc', 'potassium_cmolkg', 'phosporus_mgkg']
    dest_dir = Path(dest_dir)

    # Targets do not depend on the organization: drop missing values once per analyte
    targets = {analyte: df_wet.dropna(subset=[analyte]) for analyte in analytes}

    for org, group in tqdm(df_spec.groupby('organization')):
        # Once indexed by sample id, the first two columns are skipped positionally
        # (the CSV row index 'Unnamed: 0' and 'organization' in the raw file);
        # everything after them is treated as the spectrum
        spectra = group.set_index('sample_id').iloc[:, 2:]
        # Column order is reversed for both the wavenumbers and the values below
        wns = spectra.columns[::-1]

        for analyte in analytes:
            df_y = targets[analyte]
            idx_smp = df_y['sample_id'].values
            y = df_y[analyte].values

            # Spectra of the samples having a value for this analyte, in target order
            X = spectra.loc[idx_smp, :].values[:, ::-1]

            # NOTE: Kennard-Stone splitting is disabled (hence `ks_on_features` has
            # no effect); a reproducible random split is used instead
            X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
                X, y, idx_smp, test_size=test_size, random_state=random_state)

            splits = {'train': (idx_train, X_train, y_train),
                      'test': (idx_test, X_test, y_test)}
            for ds, (sample_ids, X_ds, y_ds) in splits.items():
                for sample_id, absorbance, value in zip(sample_ids, X_ds, y_ds):
                    # str() so that non-string sample ids can be used as folder names
                    path = dest_dir / org / analyte / ds / str(sample_id)
                    _write_sample(path, wns, absorbance, analyte, value)



In [None]:
#|eval: false
# Bundle the wet chemistry with the spectra held in `df_spec`
fname = '../_data/wcrc-ct/RT_wetchem_soildata.csv'
bundle(
    load_wetchem(fname),
    df_spec,
    dest_dir='../_data/wcrc-ct-bundled',
    ks_on_features=False,
)

100%|██████████████████████████████████████████████████████████████████| 16/16 [00:07<00:00,  2.25it/s]
