# Loading

> Load data from spectral libraries.

In [None]:
#| default_exp loading

In [None]:
#| export
import fastcore.all as fc
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re

In [None]:
#| exports
# Default location of the gzipped OSSL level-0 (v1.2) CSV dump, under the user's home directory.
fname_ossl = Path.home().joinpath('.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz')

In [None]:
#| exports
class OSSLLoader:
    "Load OSSL data and filter it by spectra type and analytes of interest."
    CFGS = {
        'visnir': {'ref_col': 'scan_visnir.1500_ref', 'range': [400, 2500]},
        'mir': {'ref_col': 'scan_mir.1500_abs', 'range': [600, 4000]}
    }
    
    DTYPE_DICT = {
        'id.layer_local_c': 'object',
        'id.location_olc_txt': 'object',
        'id.dataset.site_ascii_txt': 'object',
        'id.scan_local_c': 'object',
        'layer.texture_usda_txt': 'object',
        'pedon.taxa_usda_txt': 'object',
        'horizon.designation_usda_txt': 'object',
        'location.country_iso.3166_txt': 'object',
        'surveyor.address_utf8_txt': 'object',
        'efferv_usda.a479_class': 'object',
        'scan.mir.date.begin_iso.8601_yyyy.mm.dd': 'object',
        'scan.mir.date.end_iso.8601_yyyy.mm.dd': 'object',
        'scan.mir.model.name_utf8_txt': 'object',
        'scan.mir.model.code_any_txt': 'object',
        'scan.mir.method.optics_any_txt': 'object',
        'scan.mir.method.preparation_any_txt': 'object',
        'scan.mir.license.title_ascii_txt': 'object',
        'scan.mir.license.address_idn_url': 'object',
        'scan.mir.doi_idf_url': 'object',
        'scan.mir.contact.name_utf8_txt': 'object',
        'scan.mir.contact.email_ietf_txt': 'object',
        'scan.visnir.date.begin_iso.8601_yyyy.mm.dd': 'object',
        'scan.visnir.date.end_iso.8601_yyyy.mm.dd': 'object',
        'scan.visnir.model.name_utf8_txt': 'object',
        'scan.visnir.model.code_any_txt': 'object',
        'scan.visnir.method.optics_any_txt': 'object',
        'scan.visnir.method.preparation_any_txt': 'object',
        'scan.visnir.license.title_ascii_txt': 'object',
        'scan.visnir.license.address_idn_url': 'object',
        'scan.visnir.doi_idf_url': 'object',
        'scan.visnir.contact.name_utf8_txt': 'object',
        'scan.visnir.contact.email_ietf_txt': 'object'
    }

    def __init__(self, fname: Path = Path.home() / '.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz'):
        self.fname = fname
        self.df = None
        self.ds_name_encoder = LabelEncoder()

    def load_data(self, analytes: str|list, spectra_type: str = 'visnir', debug: bool = False):
        """Load OSSL data and filter it by spectra type and analytes of interest"""
        print(f'Loading data from {self.fname} ...')
        self.df = pd.read_csv(self.fname, dtype=self.DTYPE_DICT,
                              compression='infer', low_memory=True)

        if debug:
            return self.df

        analytes = [analytes] if isinstance(analytes, str) else analytes
        subset = analytes + [self.CFGS[spectra_type]['ref_col']]
        self.df = self.df.dropna(subset=subset, how='any')

        X = self._get_spectra(spectra_type)
        y = self.df[analytes].values
        smp_idx = self.df['id.layer_uuid_txt'].values
        ds_name = self._encode_dataset_names()
        X_names = self._get_wavelengths(spectra_type)

        return X, y, X_names, smp_idx, ds_name, self.ds_name_encoder.classes_

    def _get_spectra(self, spectra_type: str):
        cols_ref = [name for name in self.df.columns if f'scan_{spectra_type}.' in name]
        X = self.df[cols_ref].values
        lower_limit, upper_limit = self.CFGS[spectra_type]['range']
        X_names = self._get_wavelengths(spectra_type)
        idxs = np.where((X_names >= lower_limit) & (X_names <= upper_limit))[0]
        return X[:, idxs]

    def _encode_dataset_names(self):
        return self.ds_name_encoder.fit_transform(self.df['dataset.code_ascii_txt'])

    def _get_wavelengths(self, spectra_type: str):
        pattern = r"scan_{}\.(\d+)_".format(spectra_type)
        return np.array([int(re.search(pattern, name).group(1)) for name in self.df.columns
                         if re.search(pattern, name)])


Usage example:

In [None]:
#| eval: false
# Target analyte column to load alongside the MIR spectra.
analytes = 'k.ext_usda.a725_cmolc.kg'
loader = OSSLLoader()
(X, y, wavenumbers,
 smp_idx, ds_name, ds_label) = loader.load_data(analytes, spectra_type='mir')

# Quick look at what was returned.
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
print(f'wavenumbers: {wavenumbers}')
print(f'smp_idx: {smp_idx}')
print(f'ds_name: {ds_name}')
print(f'ds_label: {ds_label}')

Loading data from /Users/franckalbinet/.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz ...
X shape: (57674, 1701)
y shape: (57674, 1)
wavenumbers: [ 600  602  604 ... 3996 3998 4000]
smp_idx: ['3998362dd2659e2252cd7f38b43c9b1f' '2bab4dbbac073b8648475ad50d40eb95'
 '29213d2193232be8867d85dec463ec00' ... 'b790da349d49885c5727a2b5fd67b13d'
 'a057a7ead9eebce24d4039de7fd5e01b' '80bf4a0dc30f60552a38193d5c09b9cd']
ds_name: [0 0 0 ... 3 3 3]
ds_label: ['GARRETT.SSL' 'ICRAF.ISRIC' 'KSSL.SSL' 'LUCAS.WOODWELL.SSL']


In [None]:
#| eval: false
#| hide
# For further testing, we save the first 10 samples of the dataset.
n_samples = 10
fixture = (
    X[:n_samples, :],
    y[:n_samples],
    wavenumbers,           # per-column axis: kept whole
    smp_idx[:n_samples],
    ds_name[:n_samples],
    ds_label,              # class labels, not per-sample: kept whole so every code in ds_name stays decodable
)
fc.save_pickle('./files/spectrum-and-all.pkl', fixture)