# Dataloaders

> Custom PyTorch Datasets and Dataloaders.

In [35]:
# | default_exp dataloaders

In [107]:
# | export
from pathlib import Path
from tqdm import tqdm
from collections import namedtuple
import fastcore.all as fc

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import tensor

In [38]:
# | eval: false
# | hide
from lssm.loading import load_ossl
from lssm.preprocessing import ToAbsorbance, ContinuumRemoval, SNV
from lssm.visualization import plot_spectra
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

In [39]:
# | hide
from nbdev import show_doc

In [40]:
# | export
def get_dls(
    train_ds:Dataset, # Train dataset
    valid_ds:Dataset, # Valid dataset
    bs:int, # Batch size 
    **kwargs # Extra keyword arguments forwarded to both `DataLoader`s (`shuffle` only affects the train one)
    ) -> tuple:  # NamedTuple of `DataLoader`s with `.train` and `.valid` field names
    "Train and valid dataloaders."
    Dataloaders = namedtuple('Dataloader', ['train', 'valid'])
    # Training data is shuffled by default. A caller may pass `shuffle=...` to
    # override it; it is popped so that it is neither passed twice to the train
    # loader (a TypeError) nor forwarded to the validation loader.
    shuffle = kwargs.pop('shuffle', True)
    return Dataloaders(
        DataLoader(train_ds, batch_size=bs, shuffle=shuffle, **kwargs),
        # The validation loader uses twice the training batch size and is never shuffled.
        DataLoader(valid_ds, batch_size=bs*2, **kwargs))


In [106]:
# | export
class CrossSpectraDataset(Dataset):
    "PyTorch dataset serving pairs of spectra for IR instrument cross-calibration."
    def __init__(self, 
                 df:pd.DataFrame, # Dataframe whose columns from `spectra_at` onwards hold the spectra
                 pair_idxs:list, # Pairs (tuples) of row positions; each pair is one dataset item
                 spectra_at:int=2 # Position of the first spectral column in `df`
                 ):
        spectra = df.iloc[:, spectra_at:].to_numpy()
        self.data = tensor(spectra, dtype=torch.float32)
        # Lists (not tuples) so that each pair selects two rows of `self.data` at once.
        self.pair_idxs = list(map(list, pair_idxs))

    def __len__(self):
        "Number of index pairs."
        return len(self.pair_idxs)

    def __getitem__(self, idx):
        "Return the two spectra of pair `idx`, each with a leading axis of size 1."
        pair = self.data[self.pair_idxs[idx]]
        X, y = pair
        return X.unsqueeze(0), y.unsqueeze(0)

For example:

In [108]:
from lssm.loading import get_spectra_pair_idxs

# Toy dataframe: two samples (0 and 1), each measured by organizations A and B,
# with two spectral columns ('650' and '652') starting at column position 2.
df_test = pd.DataFrame(
    data={'sample_id': [0, 0, 1, 1],
          'organization': ['A', 'B', 'A', 'B'],
          '650': [1.2, 0.6, 0.5, 1.3],
          '652': [0.8, 0.4, 0.6, 1.1]},
    index=pd.Index([0, 1, 2, 3], name='index'))
df_test

Unnamed: 0_level_0,sample_id,organization,650,652
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,A,1.2,0.8
1,0,B,0.6,0.4
2,1,A,0.5,0.6
3,1,B,1.3,1.1


In [109]:
# List the index pairs computed from `df_test`; the same call supplies the
# `pair_idxs` argument of `CrossSpectraDataset` in the next cell.
get_spectra_pair_idxs(df_test)

[(0, 0), (0, 1), (1, 0), (1, 1), (2, 2), (2, 3), (3, 2), (3, 3)]

In [104]:
ds = CrossSpectraDataset(df_test, get_spectra_pair_idxs(df_test))

# Spectral values ('650', '652') of each row of `df_test`, keyed by row index.
rows = {0: [1.2, 0.8], 1: [0.6, 0.4], 2: [0.5, 0.6], 3: [1.3, 1.1]}

# Index pairs expected at dataset positions 0..5, in order.
expected_pairs = [(0, 0), (0, 1), (1, 0), (1, 1), (2, 2), (2, 3)]

for i, (src, tgt) in enumerate(expected_pairs):
    # Each item is (spectrum at `src`, spectrum at `tgt`), both with a leading axis of size 1.
    x, y = tensor([rows[src]]), tensor([rows[tgt]])
    fc.test_eq(ds[i], (x, y))

In [14]:
# | export
class SpectralDataset(Dataset):
    "Custom PyTorch Dataset yielding (infrared spectrum, soil property) pairs."

    def __init__(self,
                 X:np.ndarray, # Spectra, indexed as `X[idx, :]`
                 y:np.ndarray, # Analyte values, indexed as `y[idx, :]`
                 metadata:np.ndarray=None # Stored as-is; not used by `__getitem__`
                 ):
        self.X, self.y, self.metadata = X, y, metadata

    def __len__(self):
        "Length of `X`."
        return len(self.X)

    def __getitem__(self, idx):
        "Return item `idx` as float32 tensors: the spectrum gains a leading axis of size 1."
        spectrum = self.X[idx, :][None, :]
        target = self.y[idx, :]
        return (tensor(spectrum, dtype=torch.float32),
                tensor(target, dtype=torch.float32))

For example, below is a canonical pipeline in which we:

1. load the data
2. transform the data (here to absorbance and continuum removal)
3. perform a train, test split
4. wrap each split in the custom PyTorch `SpectralDataset`
5. finally get PyTorch dataloaders ready for training

In [19]:
# |eval: false

# 1. Data loading: one analyte, visible/near-infrared spectra
analytes = 'k.ext_usda.a725_cmolc.kg'
data = load_ossl(analytes, spectra_type='visnir')
X, y, X_names, smp_idx, ds_name, ds_label = data

# 2. Transform: convert to absorbance, then apply continuum removal
X = Pipeline(steps=[
    ('to_abs', ToAbsorbance()),
    ('cr', ContinuumRemoval(X_names)),
]).fit_transform(X)

# 3. Train/valid split (80/20), stratified on `ds_name`
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=ds_name, random_state=41)

# 4. Get PyTorch datasets
train_ds = SpectralDataset(X_train, y_train)
valid_ds = SpectralDataset(X_valid, y_valid)

# 5. Then PyTorch dataloaders
dls = get_dls(train_ds, valid_ds, bs=32)

# Peek at one training batch to check the tensor shapes.
first_batch = next(iter(dls.train))
print(f'First batch X dim: {first_batch[0].shape}')
print(f'First batch y dim: {first_batch[1].shape}')


Reading & selecting data ...


100%|██████████| 44489/44489 [00:16<00:00, 2758.44it/s]


First batch X dim: torch.Size([32, 1, 1051])
First batch y dim: torch.Size([32, 1])


In [20]:
# | export
class SpectralEmbeddingDataset(Dataset):
    "Custom PyTorch Dataset yielding (infrared spectrum, same infrared spectrum) pairs."

    def __init__(self,
                 X:np.ndarray, # Spectra, indexed as `X[idx, :]`
                 ):
        self.X = X

    def __len__(self):
        "Length of `X`."
        return len(self.X)

    def __getitem__(self, idx):
        "Return spectrum `idx` twice, as two separate float32 tensors with a leading axis of size 1."
        spectrum = self.X[idx, :][None, :]
        # Two independent tensors are built, so input and target do not share storage.
        source = tensor(spectrum, dtype=torch.float32)
        target = tensor(spectrum, dtype=torch.float32)
        return (source, target)

In [None]:
# (Intentionally empty: this cell previously held a stray `dd`, an undefined
# name that raised NameError on "Restart Kernel -> Run All".)