# Loading

> Utilities to load Soil Spectral Libraries (SSL) compiled by [OSSL](https://soilspectroscopy.github.io/ossl-manual/) and others.

In [1]:
#| default_exp loading

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#| export
from pathlib import Path
from tqdm import tqdm
from typing import Union, List
import re
import fastdownload as fd
import fastcore.all as fc

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [4]:
# | export
# Public URLs of the compiled OSSL v1.2 files (gzip-compressed CSV), one per level (L0, L1).
PATH_OSSL_ALL_L0_V1_2 = 'https://storage.googleapis.com/soilspec4gg-public/ossl_all_L0_v1.2.csv.gz'
# The L1 constant previously duplicated the L0 URL; it now points at the L1 file.
PATH_OSSL_ALL_L1_V1_2 = 'https://storage.googleapis.com/soilspec4gg-public/ossl_all_L1_v1.2.csv.gz'

# Per spectra-type settings, keyed by the `spectra_type` argument of `load_ossl`:
#   'ref_col': column that must be non-null for a row to be kept
#   'range'  : [lower, upper] inclusive bounds applied to the integer embedded
#              in the spectral column names (`scan_<type>.<int>_...`)
CFGS = {
    'visnir': {'ref_col': 'scan_visnir.1500_ref', 'range': [400, 2500]},
    'mir': {'ref_col': 'scan_mir.1500_abs', 'range': [600, 4000]},
}

In [27]:
# | export
def download(url, dest):
    """Download the given `url` into `dest` (creates it on the way if it does not exist).

    `dest` may be a `Path` or a plain string. The value returned by
    `fd.download_url` is passed through unchanged.
    """
    dest = Path(dest)  # accept plain-string paths as well as `Path` objects
    # `exist_ok=True` keeps creation safe if another process makes the
    # directory between the existence check and the `mkdir` call.
    if not dest.exists(): dest.mkdir(parents=True, exist_ok=True)
    return fd.download_url(url, dest)

In [20]:
# | export
def load_ossl(analytes: Union[str, List[str]], # Using OSSL's analytes naming conventions
              spectra_type: str='visnir',  # Possible values: 'mir', 'visnir'
              dest:Path=Path.home() / '.lssm/data/ossl', # directory containing the data
              ):
    """Load all available OSSL data and filter it by spectra type and analytes of interest.

    The file at `PATH_OSSL_ALL_L1_V1_2` is downloaded into `dest` unless a file
    with the same name is already there. Rows with a missing value in any of
    `analytes`, or in the reference column configured for `spectra_type` in
    `CFGS`, are dropped. Spectral columns are those named
    `scan_<spectra_type>.<int>_...` whose integer lies within the inclusive
    range configured in `CFGS`.

    Returns the tuple `(X, y, X_names, smp_idx, ds_name, ds_label)`: the
    selected spectral columns as an array, the analyte columns as an array, the
    integers parsed from the selected column names, the `id.layer_uuid_txt`
    values, the label-encoded `dataset.code_ascii_txt` values and the encoder's
    classes.

    Raises `KeyError` if `spectra_type` is not a key of `CFGS`.
    """
    # Look the configuration up first so that an unknown spectra type fails
    # before any download or CSV parsing takes place.
    cfg = CFGS[spectra_type]
    lower_limit, upper_limit = cfg['range']

    dest = Path(dest)  # accept plain-string paths as well as `Path` objects
    url = PATH_OSSL_ALL_L1_V1_2
    fname = dest / Path(url).name
    if not fname.exists():
        print('Downloading & saving to: ', str(fname))
        download(url, dest)

    print('Reading & selecting data ...')

    df = pd.read_csv(fname, compression='infer', low_memory=True)

    # Normalise to a list so that tuples (or other iterables) can be concatenated below.
    analytes = [analytes] if isinstance(analytes, str) else list(analytes)

    df = df.dropna(subset=analytes + [cfg['ref_col']])

    # Select the spectral columns and parse their integer position with ONE
    # regex, so that the columns of `X` and the entries of `X_names` are
    # aligned by construction (a looser substring test could pick up extra
    # columns and shift the indices).
    pattern = re.compile(r"scan_{}\.(\d+)_".format(re.escape(spectra_type)))
    spectral = []
    for name in df.columns:
        found = pattern.search(name)
        if not found: continue
        position = int(found.group(1))
        if lower_limit <= position <= upper_limit:
            spectral.append((name, position))

    X = df[[name for name, _ in spectral]].values
    X_names = np.array([position for _, position in spectral], dtype=int)

    y = df[analytes].values
    smp_idx = df['id.layer_uuid_txt'].values

    ds_name_encoder = LabelEncoder()
    ds_name = ds_name_encoder.fit_transform(df['dataset.code_ascii_txt'])

    return X, y, X_names, smp_idx, ds_name, ds_name_encoder.classes_


Example:

In [28]:
#|eval: false
# A single analyte can be passed as a plain string; `load_ossl` wraps it in a list.
analytes = 'k.ext_usda.a725_cmolc.kg'
# If the OSSL file is not yet in the default `dest` directory it is downloaded first, then read.
data = load_ossl(analytes, spectra_type='visnir')
# `load_ossl` returns a 6-tuple; unpack it in the order it is returned.
X, y, X_names, smp_idx, ds_name, ds_label = data

# Shapes of the spectra array and of the analyte array.
print(X.shape, y.shape)

Reading & selecting data ...
(44489, 1051) (44489, 1)
