# WCRC

> Various transforms to be chained into pipelines that create fastai `DataLoaders`

In [None]:
#| default_exp data.transforms.wcrc

In [None]:
#| export
from __future__ import annotations
from fastai.vision.all import *
from fastai.data.all import *
from fastcore.basics import patch
from pathlib import Path
import pandas as pd
from tqdm import tqdm

In [None]:
#| hide
from nbdev.showdoc import *
from nbdev.cli import *

## Input (spectra)

In [None]:
#|eval: false
# Components of the sample directory path used throughout the examples below
org, analyte, ds = 'Woodwell_vertex', 'potassium_cmolkg', 'train'
path = Path('../_data/wcrc-ct-bundled', org, analyte, ds)
path

Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train')

In [None]:
# List the entries found directly under `path`
path.ls()

(#48) [Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_23'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_24'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_12'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_15'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_41'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_46'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_48'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_14'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_13'),Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_22')...]

In [None]:
#paths = L()
#for p in tqdm(path.ls()):
#    is_analyte_pos = AnalytesTfm(analytes=[725])(p).item() > 0
#    has_spectra = len(get_spectra_files(p)) > 0
#    if is_analyte_pos and has_spectra: paths.append(p)

In [None]:
#| export
@Transform
def get_spectra_files(path:Path, # Directory that contains spectra replicates and wet chemistry
                     ) -> L: # List of spectra replicates (if any) file paths in the directory
    "Collect the entries of `path` whose file name starts with `spectrum`"
    # An empty `L` is returned when the directory holds no matching entry.
    matches = [entry for entry in path.ls() if entry.name.startswith('spectrum')]
    return L(matches)

In [None]:
#|eval: false
# Spectra files found in the first entry of `path`
get_spectra_files(path.ls()[0])

(#1) [Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_23/spectrum.csv')]

In [None]:
# Descending wavenumber axis: 4000, 3998, ..., 602 (the stop value 600 is excluded)
wn = np.arange(4000, 600, -2)
wn

array([4000, 3998, 3996, ...,  606,  604,  602])

In [None]:
#| export
class Spectra(fastuple):
    "Tuple of `(spectra, wavenumbers, sample_id)` that can plot itself"
    def show(self, ctx=None, figsize=(12,4), **kwargs):
        "Plot every spectrum against the wavenumbers on `ctx`; a new axes is created when `ctx` is None"
        replicates, wavenumbers, sample_id = self
        if ctx is None:
            _, ctx = plt.subplots(figsize=figsize)
        # The x-axis runs from the largest to the smallest wavenumber.
        ctx.set_xlim(np.max(wavenumbers), np.min(wavenumbers))
        ctx.set(xlabel='Wavenumber', ylabel='Absorbance')
        ctx.set_axisbelow(True)
        # One line per replicate, all drawn with the same colour and width.
        for replicate in replicates:
            ctx.plot(wavenumbers, replicate, c='steelblue', lw=1)
        ctx.grid(True, which='both')
        ctx.set_title(f'Sample ID: {sample_id}')
        return ctx

In [None]:
# Name of the directory holding a spectrum file (e.g. usable as a sample identifier)
Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_23/spectrum.csv').parent.stem

'RT_23'

In [None]:
#| export
#class SpectraTransform(Transform):
#    def __init__(self, paths):
#        self.paths = L(fname for fname in paths 
#                       if re.match('spectrum', fname.name))
#        self.df = pd.read_csv(self.paths[0]) 
#        self.wns = self.df['wavenumber'].values
#        self.smp_id = paths[0].parent.stem
#        self.n = self.df.shape[0]
#        self.m = len(paths)

#    def encodes(self, f):
#        x = np.empty((self.m, self.n))
#        for i, fname in enumerate(f):
#            x[i,:] = pd.read_csv(fname)['absorbance'].values
#        return Spectra(x, self.wns, self.smp_id)

In [None]:
#path.ls()

In [None]:
#paths = L([Path('../_data/wcrc-ct-bundled/Woodwell_vertex/potassium_cmolkg/train/RT_22/spectrum.csv')])
#path
#tfm = SpectraTransform(path.ls())
#tfm(paths).show();

In [None]:
#| export
@Transform
def to_spectra(paths:L, # List of spectra file paths
              ) -> torch.Tensor: # Spectra
    """Transform list of spectra file paths to a torch array of dimension: (n_replicates, n_wavenumbers)

    Each file is read once and only its `absorbance` column is used.
    Raises `IndexError` when `paths` is empty and `ValueError` when the
    replicates do not all have the same number of rows.
    """
    if not paths:
        # Same exception type as indexing an empty list, but with an actionable message.
        raise IndexError('to_spectra received an empty list of spectra file paths')
    # One row per replicate file, in the order given by `paths`.
    replicates = [pd.read_csv(fname)['absorbance'].values for fname in paths]
    # `np.stack` refuses ragged input, so mismatched replicates fail loudly here.
    x = np.stack(replicates).astype(np.float64)
    return torch.Tensor(x)

In [None]:
#|eval: false
# NOTE: `snv` is defined in a later cell of this notebook; run that cell before this one.
tls = TfmdLists(path.ls(), [get_spectra_files, to_spectra, snv])
tls[0][0]

tensor([-1.6334, -1.6329, -1.6325,  ...,  1.5224,  1.5367,  1.7113])

In [None]:
#| export
@Transform
def snv(x:torch.Tensor, # Spectrum, or stack of spectra with wavenumbers on the last dimension
       ) -> Tensor: # Standard Normal Variate of each spectrum
    """Standard Normal Variate Transform of input spectrum

    Every spectrum (last dimension of `x`) is centred on its own mean and
    divided by its own standard deviation. For a single spectrum the result
    is the same as using the statistics of the whole tensor.
    """
    # Per-spectrum statistics: pooling over replicates (or a whole batch)
    # would let one spectrum influence the scaling of another.
    mean = torch.mean(x, dim=-1, keepdim=True)
    std = torch.std(x, dim=-1, keepdim=True)
    return (x - mean)/std

## Target (Soil properties)

In [None]:
#| export
class Analyte(Tensor):
    "Tensor of analyte measurement(s) that displays itself by printing"
    def show(self, ctx=None, **kwargs):
        "Print the tensor; `ctx` and `kwargs` are accepted but not used"
        print(self)

In [None]:
#| export
class AnalytesTfm(Transform):
    "Transform that stores an optional list of analytes to select"
    def __init__(self, 
                 analytes:list|None=None): # Analytes of interest; stored as-is on the instance
        self.analytes = analytes

In [None]:
#| export
@AnalytesTfm
def encodes(self, 
            path: Path, # Path to directory containing both spectra and analyte(s) measurement
           ):
    "Read the `target*` file found in `path` and return its `value` column as an `Analyte`"
    # The first entry whose name starts with `target` is used; indexing fails
    # with `IndexError` when the directory has none.
    candidates = [entry for entry in path.ls() if entry.name.startswith('target')]
    measurements = pd.read_csv(candidates[0])
    if self.analytes:
        # Keep only the rows of the requested analyte(s).
        wanted = measurements.analyte.isin(self.analytes)
        measurements = measurements[wanted]
    return Analyte(measurements['value'].values)

In [None]:
#|eval: false
# Apply the transform to the first entry of `path`, keeping only `potassium_cmolkg`
AnalytesTfm(analytes=['potassium_cmolkg'])(path.ls()[0])

Analyte([0.1896])

In [None]:
#|eval: false
# Or as a TfmdLists pipeline, with `torch.log10` applied to the analyte value(s)
tls = TfmdLists(path.ls(), [AnalytesTfm(analytes=['potassium_cmolkg']), torch.log10])
tls[0]

Analyte([-0.7222])

In [None]:
#t = np.array([AnalytesTfm(analytes=['potassium_cmolkg'])(fname).item() for fname in path.ls()])
#np.log10(t.min()), np.log10(t.max())

(nan, nan)

## How to use these transforms?

1. First create two parallel pipelines (one for the features and one for the targets):

In [None]:
#|eval: false
# Features: directory -> spectra files -> tensor -> SNV
x_tfms = [get_spectra_files, to_spectra, snv]
# Targets: directory -> analyte value(s) -> log10
y_tfms = [AnalytesTfm(analytes=['potassium_cmolkg']), torch.log10]

2. Create your splits and build a fastai `Datasets`:

In [None]:
#|eval: false
# Seeded random split over the entries of `path`
# NOTE(review): `path.ls()` is called twice; the split indices assume both calls
# return the entries in the same order — confirm, or list the directory once.
splits = RandomSplitter(seed=42)(path.ls())
# Each item goes through both pipelines: `x_tfms` (features) and `y_tfms` (targets)
dsets = Datasets(path.ls(), [x_tfms, y_tfms], splits=splits)

3. Then create your `DataLoaders`:

In [None]:
#|eval: false
# Build the dataloaders with a batch size of 16
dls = dsets.dataloaders(bs=16)

In [None]:
#|eval: false
# First element of a training batch: the output of the `x_tfms` pipeline
dls.train.one_batch()[0]

tensor([[[-1.4117, -1.4118, -1.4119,  ...,  1.7971,  1.7349,  1.7549]],

        [[-1.4763, -1.4756, -1.4752,  ...,  1.6744,  1.7781,  1.7649]],

        [[-1.5275, -1.5270, -1.5265,  ...,  1.5975,  1.6960,  1.7508]],

        ...,

        [[-1.3665, -1.3660, -1.3656,  ...,  2.1362,  2.0951,  1.9890]],

        [[-1.5171, -1.5170, -1.5164,  ...,  2.0831,  1.8169,  1.7904]],

        [[-1.5781, -1.5779, -1.5778,  ...,  1.1844,  1.4068,  1.3993]]])

In [None]:
#|eval: false
# Second element of a training batch: the output of the `y_tfms` pipeline
dls.train.one_batch()[1]

Analyte([[-0.0967],
         [-0.2080],
         [-0.2495],
         [-0.5222],
         [-0.4369],
         [-0.2424],
         [-1.0660],
         [-0.2549],
         [-0.7222],
         [-0.3017],
         [-0.0274],
         [-0.9488],
         [ 0.0266],
         [-0.0143],
         [-0.4307],
         [-0.3165]])

In [None]:
#for s, a in dls.train:
#    print(s.shape)
#    print(a.shape)

In [None]:
#dls.show_batch(max_n=4)