# Data loading

> Utilities to load Soil Spectral Libraries (SSL) compiled by [OSSL](https://soilspectroscopy.github.io/ossl-manual/) and others.

In [None]:
#| default_exp loading

In [None]:
%load_ext autoreload
%autoreload 2

In [42]:
#| export
from pathlib import Path
from tqdm import tqdm
from typing import Union, List
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import warnings
# NOTE(review): with no `category`/`module` filter this silences *every* warning
# for the whole process once this cell (or the module exported from it) is run,
# including deprecation notices from pandas/numpy. Confirm this is intended.
warnings.filterwarnings('ignore')

In [70]:
# | export
def load_ossl(fname: Path,  # Path to OSSL gzipped csv dump
              # Using OSSL's analytes naming conventions
              analytes: Union[str, List[str]],
              spectra_type: str,  # Possible values: 'mir', 'visnir'
              ) -> tuple:  # (X, y, X_names, smp_idx, ds_name, ds_label)
    """Load spectra and analyte measurements from an OSSL csv dump.

    Rows with a missing value in any requested analyte, or in the reference
    scan column of the requested `spectra_type`, are dropped before extraction.

    Returns a 6-tuple:
      * X        -- 2D array with one column per spectral column of the file
      * y        -- 2D array with one column per requested analyte
      * X_names  -- integer parsed from each spectral column name
                    (e.g. 350 for 'scan_visnir.350_ref'), aligned with X's columns
      * smp_idx  -- values of the 'id.layer_uuid_txt' column
      * ds_name  -- integer-encoded values of the 'dataset.code_ascii_txt' column
      * ds_label -- class labels such that ds_label[ds_name] gives back the codes

    Raises `KeyError` when `spectra_type` is not one of the supported values.
    """
    # Column used to decide whether a row carries a spectrum of the requested type.
    scan_repr = {'visnir': 'scan_visnir.350_ref', 'mir': 'scan_mir.600_abs'}

    # Fail fast: reject an unsupported type *before* parsing the csv.
    if spectra_type not in scan_repr:
        raise KeyError(f"spectra_type must be one of {sorted(scan_repr)}, "
                       f"got {spectra_type!r}")

    # Normalise to a fresh list so tuples (or other sequences) are accepted too.
    analytes = [analytes] if isinstance(analytes, str) else list(analytes)

    df = pd.read_csv(fname, compression='infer', low_memory=True)
    df = df.dropna(subset=analytes + [scan_repr[spectra_type]])

    # A single regex pass both selects the spectral columns and extracts the
    # number embedded in their name, so X's columns and X_names cannot drift apart.
    pattern = re.compile(r"scan_{}\.(\d+)_".format(spectra_type))
    matches = [(name, pattern.search(name)) for name in df.columns]
    cols_ref = [name for name, match in matches if match]
    X_names = np.array([int(match.group(1)) for _, match in matches if match])

    X = df[cols_ref].values
    y = df[analytes].values
    smp_idx = df['id.layer_uuid_txt'].values

    ds_name_encoder = LabelEncoder()
    ds_name = ds_name_encoder.fit_transform(df['dataset.code_ascii_txt'])

    return X, y, X_names, smp_idx, ds_name, ds_name_encoder.classes_


Example:

In [59]:
#|eval: false
# Analyte to predict (OSSL column name) and location of the local OSSL dump.
analytes = 'k.ext_usda.a725_cmolc.kg'
fname_ossl = Path.home() / 'pro/data/ossl/gcs_version/ossl_all_L0_v1.2.csv.gz'

# Load the 'visnir' spectra for that analyte, then unpack the returned tuple.
data = load_ossl(fname=fname_ossl, analytes=analytes, spectra_type='visnir')
(X, y, X_names, smp_idx, ds_name, ds_label) = data