# Loading

> Load data from spectral libraries.

In [None]:
#| default_exp loading

In [None]:
#| export
import fastcore.all as fc
from fastcore.basics import patch
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import numpy as np
import re

In [None]:
#| exports
# Default location of the OSSL `.csv.gz` file, resolved under the user's home directory.
fname_ossl = Path.home().joinpath('.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz')

In [None]:
#| exports
class OSSLLoader:
    "Load OSSL data and filter it by spectra type and analytes of interest."
    # Columns read with an explicit `object` dtype; this mapping is handed to
    # `pd.read_csv(dtype=...)` when the file is loaded.
    DTYPE_DICT = dict.fromkeys((
        'id.layer_local_c',
        'id.location_olc_txt',
        'id.dataset.site_ascii_txt',
        'id.scan_local_c',
        'layer.texture_usda_txt',
        'pedon.taxa_usda_txt',
        'horizon.designation_usda_txt',
        'location.country_iso.3166_txt',
        'surveyor.address_utf8_txt',
        'efferv_usda.a479_class',
        'scan.mir.date.begin_iso.8601_yyyy.mm.dd',
        'scan.mir.date.end_iso.8601_yyyy.mm.dd',
        'scan.mir.model.name_utf8_txt',
        'scan.mir.model.code_any_txt',
        'scan.mir.method.optics_any_txt',
        'scan.mir.method.preparation_any_txt',
        'scan.mir.license.title_ascii_txt',
        'scan.mir.license.address_idn_url',
        'scan.mir.doi_idf_url',
        'scan.mir.contact.name_utf8_txt',
        'scan.mir.contact.email_ietf_txt',
        'scan.visnir.date.begin_iso.8601_yyyy.mm.dd',
        'scan.visnir.date.end_iso.8601_yyyy.mm.dd',
        'scan.visnir.model.name_utf8_txt',
        'scan.visnir.model.code_any_txt',
        'scan.visnir.method.optics_any_txt',
        'scan.visnir.method.preparation_any_txt',
        'scan.visnir.license.title_ascii_txt',
        'scan.visnir.license.address_idn_url',
        'scan.visnir.doi_idf_url',
        'scan.visnir.contact.name_utf8_txt',
        'scan.visnir.contact.email_ietf_txt',
    ), 'object')

    def __init__(self, 
                 fname: Path = Path.home() / '.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz', # Data source file name
                 cfgs: dict = None): # Spectra type configuration
        # Per spectra type: the reference column whose presence is required,
        # and the [lower, upper] range kept when extracting spectra.
        default_cfgs = {
            'visnir': {'ref_col': 'scan_visnir.1500_ref', 'range': [400, 2500]},
            'mir': {'ref_col': 'scan_mir.1500_abs', 'range': [650, 4000]},
        }
        self.fname = fname
        self.df = None  # populated by `load_data`
        self.ds_name_encoder = LabelEncoder()
        # A missing or empty `cfgs` falls back to the defaults above.
        self.cfgs = cfgs if cfgs else default_cfgs


In [None]:
@patch
def load_data(self:OSSLLoader, 
                analytes: str|list, # Analytes of interest
                spectra_type: str = 'visnir', # Spectra type
                debug: bool = False # Debug mode
                ) -> tuple: # Return a tuple of the form (X, y, X_names, smp_idx, ds_name, ds_label)
    """Load OSSL data and filter it by spectra type and analytes of interest.

    Rows missing any requested analyte or the reference column configured for
    `spectra_type` are dropped. With `debug=True` the unfiltered DataFrame is
    returned instead of the tuple. Raises `KeyError` if `spectra_type` is not a
    key of `self.cfgs`.
    """
    if not debug:
        # Resolve the inputs before reading the file so an unknown `spectra_type`
        # fails immediately rather than after the whole CSV has been parsed.
        # `list(...)` also accepts tuples and other iterables of column names.
        analytes = [analytes] if isinstance(analytes, str) else list(analytes)
        required_cols = analytes + [self.cfgs[spectra_type]['ref_col']]

    print(f'Loading data from {self.fname} ...')
    self.df = pd.read_csv(self.fname, dtype=self.DTYPE_DICT,
                            compression='infer', low_memory=True)

    if debug:
        return self.df

    # Keep only samples having every analyte and a spectrum of the requested type.
    self.df = self.df.dropna(subset=required_cols, how='any')

    X, X_names = self._get_spectra(spectra_type)
    y = self.df[analytes].values
    smp_idx = self.df['id.layer_uuid_txt'].values
    ds_name = self._encode_dataset_names()

    # `classes_` maps the integer codes in `ds_name` back to dataset names.
    return X, y, X_names, smp_idx, ds_name, self.ds_name_encoder.classes_

In [None]:
@patch
def _get_spectra(self:OSSLLoader, 
                    spectra_type: str # Spectra type
                    ):
    "Return `(X, X_names)`: spectra values and their positions, restricted to the configured range."
    # Select the spectral columns and parse their positions with one and the same
    # regex, so that column i of `X` always corresponds to `X_names[i]`. (Selecting
    # by substring and parsing by regex separately can disagree on which columns
    # qualify and silently misalign the two.)
    pattern = re.compile(r"scan_{}\.(\d+)_".format(spectra_type))
    matched = [(name, m) for name in self.df.columns if (m := pattern.search(name))]
    cols_ref = [name for name, _ in matched]
    X_names = np.array([int(m.group(1)) for _, m in matched])
    X = self.df[cols_ref].values

    # Keep only the positions inside the inclusive [lower, upper] range.
    lower_limit, upper_limit = self.cfgs[spectra_type]['range']
    idxs = np.where((X_names >= lower_limit) & (X_names <= upper_limit))[0]
    return X[:, idxs], X_names[idxs]

In [None]:
@patch
def _encode_dataset_names(self:OSSLLoader):
    "Fit the loader's label encoder on the `dataset.code_ascii_txt` column and return the encoded labels."
    encoder = self.ds_name_encoder
    dataset_codes = self.df['dataset.code_ascii_txt']
    return encoder.fit_transform(dataset_codes)

In [None]:
@patch
def _get_wavelengths(self:OSSLLoader, 
                        spectra_type: str # Spectra type
                        ):
    "Parse the integer found in each `scan_<spectra_type>.<int>_` column name into an array."
    finder = re.compile(r"scan_{}\.(\d+)_".format(spectra_type))
    # One search per column; columns that do not match are skipped.
    hits = (finder.search(name) for name in self.df.columns)
    return np.array([int(hit.group(1)) for hit in hits if hit])


Usage example:

In [None]:
#| eval: false
loader = OSSLLoader()
analytes = 'k.ext_usda.a725_cmolc.kg'
X, y, wavenumbers, smp_idx, ds_name, ds_label = loader.load_data(
    analytes, spectra_type='mir')

# One line per returned item.
print(
    f'X shape: {X.shape}',
    f'y shape: {y.shape}',
    f'wavenumbers: {wavenumbers}',
    f'smp_idx: {smp_idx}',
    f'ds_name: {ds_name}',
    f'ds_label: {ds_label}',
    sep='\n',
)

Loading data from /Users/franckalbinet/.lssm/data/ossl/ossl_all_L0_v1.2.csv.gz ...
X shape: (57674, 1676)
y shape: (57674, 1)
wavenumbers: [ 650  652  654 ... 3996 3998 4000]
smp_idx: ['3998362dd2659e2252cd7f38b43c9b1f' '2bab4dbbac073b8648475ad50d40eb95'
 '29213d2193232be8867d85dec463ec00' ... 'b790da349d49885c5727a2b5fd67b13d'
 'a057a7ead9eebce24d4039de7fd5e01b' '80bf4a0dc30f60552a38193d5c09b9cd']
ds_name: [0 0 0 ... 3 3 3]
ds_label: ['GARRETT.SSL' 'ICRAF.ISRIC' 'KSSL.SSL' 'LUCAS.WOODWELL.SSL']


In [None]:
#| eval: false
#| hide
# For further testing, we save the first 10 samples of the dataset
n_samples = 10
# Per-sample arrays are cut to the first `n_samples` rows. `wavenumbers` (one entry
# per spectral column) and `ds_label` (one entry per dataset class) are not
# per-sample, so they are stored whole: slicing `ds_label` would drop class names
# as soon as there are more datasets than saved samples.
fc.save_pickle('./files/spectrum-and-all.pkl', (X[:n_samples,:], y[:n_samples], wavenumbers, smp_idx[:n_samples], 
                                                ds_name[:n_samples], ds_label))

In [None]:
#| hide
# class FukushimaLoader:
# fname_fuku = '../_data/fk/Fukushimaall_Average.csv'
#| eval: false
# df_fuk = pd.read_csv(fname_fuku)
#| eval: false
# wn_cols = [col for col in df_fuk.columns if col.isdigit()]
# non_wn_cols = [col for col in df_fuk.columns if col not in wn_cols]
# print(wn_cols[:10])
# print(f'from {wn_cols[0]} to {wn_cols[-1]}')
#| eval: false
# df_fuk[non_wn_cols].columns

#| eval: false
# df_fuk[['soil_ex_K2O']]

In [None]:
#| exports
# Default directory holding the Ringtrial CSV files, resolved under the user's home directory.
src_dir_rt = Path.home().joinpath('pro/data/woodwell-ringtrial/drive-download-20231013T123706Z-001')

In [None]:
#| exports
class RingtrialLoader:
    "Load Ringtrial data."
    def __init__(self, 
                 src_dir: Path = src_dir_rt, # Source directory
                 fname_mir: str = 'RT_STD_allMIRspectra_raw.csv', # MIR spectra file name, relative to `src_dir`
                 fname_wetchem: str = 'RT_wetchem_soildata.csv', # Wet chemistry file name, relative to `src_dir`
                 target: str = 'potassium_cmolkg' # Column returned as the target `y`
                 ):
        # Stores every constructor argument as an attribute of the same name.
        fc.store_attr()
        self.ds_name_encoder = LabelEncoder()
        
    def load_mir(self) -> pd.DataFrame:
        "Read the MIR spectra CSV from `src_dir`."
        return pd.read_csv(self.src_dir / self.fname_mir)
    
    def load_wetchem(self) -> pd.DataFrame:
        "Read the wet chemistry CSV from `src_dir`."
        return pd.read_csv(self.src_dir / self.fname_wetchem)
    
    def separate_spectra_and_others(self, df_merged: pd.DataFrame) -> tuple:
        "Separate the merged dataframe into spectral data and metadata."
        # Columns whose label consists only of digits are treated as spectral;
        # everything else is metadata. Single pass, original column order kept.
        spectral_cols, metadata_cols = [], []
        for col in df_merged.columns:
            (spectral_cols if str(col).isdigit() else metadata_cols).append(col)
        return df_merged[spectral_cols], df_merged[metadata_cols]
    
    def make_idx(self, df: pd.DataFrame) -> pd.Series:
        "Build a `<source>-<sample_id>` identifier per row, lower-cased, with `_` replaced by `-`."
        idx = df['source'] + '-' + df['sample_id']
        # `regex=False`: the underscore is a literal character, not a pattern.
        return idx.str.lower().str.replace('_', '-', regex=False)
    
    def _encode_dataset_names(self, df: pd.Series):
        "Fit the label encoder on the given names and return the encoded labels."
        return self.ds_name_encoder.fit_transform(df)

    def load_data(self) -> tuple:
        "Load Ringtrial data and return (X, y, X_names, smp_idx, ds_name, ds_label)."
        # The wet chemistry key column is renamed from '\tsample_id' (leading tab)
        # so that it matches the spectra table's join key.
        df_wetchem = self.load_wetchem().rename(columns={'\tsample_id': 'sample_id'})
        df_merged = pd.merge(self.load_mir(), df_wetchem,
                             on='sample_id', how='inner')
        
        df_spectra, df_others = self.separate_spectra_and_others(df_merged)
        
        X = df_spectra.values
        y = df_others[self.target].values
        # Spectral column labels are digit strings; expose them as integers.
        X_names = df_spectra.columns.astype(int).values
        smp_idx = self.make_idx(df_others).values
        ds_name = self._encode_dataset_names(df_others['organization'])
        
        # `classes_` maps the integer codes in `ds_name` back to organization names.
        return X, y, X_names, smp_idx, ds_name, self.ds_name_encoder.classes_

In [None]:
#| eval: false
X, y, X_names, smp_idx, ds_name, ds_label = RingtrialLoader().load_data()

print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')
# Show the wavenumbers returned by this loader (`X_names`), not the `wavenumbers`
# variable left over from the OSSL example, which is undefined on a fresh kernel.
print(f'wavenumbers: {X_names}')
print(f'smp_idx: {smp_idx}')
print(f'ds_name: {ds_name}')
print(f'ds_label: {ds_label}')

X shape: (1400, 1676)
y shape: (1400,)
wavenumbers: ['650' '652' '654' ... '3996' '3998' '4000']
smp_idx: ['kssl-rt-01' 'kssl-rt-02' 'kssl-rt-03' ... 'napt-rt-68' 'napt-rt-69'
 'napt-rt-70']
ds_name: [ 0  0  0 ... 19 19 19]
ds_label: ['Agrocares' 'Argonne' 'CSU_IL' 'ETH_alpha_1' 'ETH_alpha_2' 'ETH_vertex'
 'IAEA_aug2022' 'KSSL' 'LandCare' 'Lesotho' 'MSU' 'OSU' 'Rothamsted'
 'Scion' 'UGhent' 'UIUC' 'USP' 'UWisc_fine' 'Woodwell_alpha'
 'Woodwell_vertex']
