# OSSL datasets

> Data loading for the [OSSL dataset](https://explorer.soilspectroscopy.org/)

In [None]:
#| default_exp datasets.ossl

In [None]:
#| exports
from fastcore.all import *
from pathlib import Path
import pandas as pd
from urllib.request import urlretrieve
import re
import numpy as np

from soilspecdata.types import *

In [None]:
#| exports
class OSSLData:
    "Holds an OSSL dataframe together with its column groups and one sample ID per row"
    def __init__(self, df):
        self.df = df
        # Populates `visnir_cols`, `mir_cols` and `properties_cols`
        self._parse_columns()
        # Sample IDs come from the local layer ID column when the dataframe has one,
        # otherwise from the row positions 0..len(df)-1
        id_col = 'id.layer_local_c'
        if id_col in df.columns:
            self.sample_ids = df[id_col].values
        else:
            self.sample_ids = np.arange(len(df))

In [None]:
#| exports
@patch
def _parse_columns(self:OSSLData):
    "Split the dataframe columns into VISNIR, MIR and property column lists"
    is_visnir = re.compile(r'scan_visnir\.\d+_ref').match
    is_mir = re.compile(r'scan_mir\.\d+_abs').match
    visnir, mir, others = [], [], []
    for col in self.df.columns:
        # Each column lands in exactly one bucket; anything that is not a
        # spectral column is treated as a property
        if is_visnir(col):
            visnir.append(col)
        elif is_mir(col):
            mir.append(col)
        else:
            others.append(col)
    self.visnir_cols = visnir
    self.mir_cols = mir
    self.properties_cols = others

In [None]:
#| exports
def get_cache_path():
    "Return the `.soilspecdata` path under the user's home directory (the directory is not created here)"
    return Path.home().joinpath('.soilspecdata')

In [None]:
#| exports
def get_ossl(url='https://storage.googleapis.com/soilspec4gg-public/ossl_all_L0_v1.2.csv.gz', # OSSL URL
             force_download=False # if True, force download
             ):
    "Download the OSSL CSV (cached under `get_cache_path()`), read it with pandas and wrap it in `OSSLData`"
    # NOTE(review): the cache file name is fixed, so a custom `url` shares the same
    # cache entry as the default one; pass `force_download=True` when switching URLs.
    cache_path = get_cache_path()/'ossl_v1.2.csv.gz'
    if not cache_path.exists() or force_download:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        # Download next to the final location and rename only once the transfer has
        # completed: an interrupted download must not leave a truncated file that
        # later calls would mistake for a valid cache.
        partial_path = cache_path.with_name(cache_path.name + '.part')
        try:
            urlretrieve(url, partial_path)
            partial_path.replace(cache_path)
        finally:
            # No-op after a successful rename; removes the leftover on failure
            partial_path.unlink(missing_ok=True)

    # Columns handed to pandas for date parsing
    date_columns = [
        'scan.mir.date.begin_iso.8601_yyyy.mm.dd',
        'scan.mir.date.end_iso.8601_yyyy.mm.dd',
        'scan.visnir.date.begin_iso.8601_yyyy.mm.dd',
        'scan.visnir.date.end_iso.8601_yyyy.mm.dd'
    ]

    # Explicit dtypes for the text columns (date columns are handled by `parse_dates`)
    dtype = {
        # IDs and codes
        'id.layer_local_c': 'string',
        'id.location_olc_txt': 'string',
        'id.dataset.site_ascii_txt': 'string',
        'id.scan_local_c': 'string',

        # Categorical text fields
        'layer.texture_usda_txt': 'category',
        'pedon.taxa_usda_txt': 'category',
        'horizon.designation_usda_txt': 'category',
        'location.country_iso.3166_txt': 'category',
        'surveyor.address_utf8_txt': 'category',
        'efferv_usda.a479_class': 'category',

        # Text fields
        'scan.mir.model.name_utf8_txt': 'string',
        'scan.mir.model.code_any_txt': 'string',
        'scan.mir.method.optics_any_txt': 'string',
        'scan.mir.method.preparation_any_txt': 'string',
        'scan.mir.license.title_ascii_txt': 'string',
        'scan.mir.license.address_idn_url': 'string',
        'scan.mir.doi_idf_url': 'string',
        'scan.mir.contact.name_utf8_txt': 'string',
        'scan.mir.contact.email_ietf_txt': 'string',
        'scan.visnir.model.name_utf8_txt': 'string',
        'scan.visnir.model.code_any_txt': 'string',
        'scan.visnir.method.optics_any_txt': 'string',
        'scan.visnir.method.preparation_any_txt': 'string',
        'scan.visnir.license.title_ascii_txt': 'string',
        'scan.visnir.license.address_idn_url': 'string',
        'scan.visnir.doi_idf_url': 'string',
        'scan.visnir.contact.name_utf8_txt': 'string',
        'scan.visnir.contact.email_ietf_txt': 'string'
    }
    df = pd.read_csv(cache_path, compression='gzip', dtype=dtype,
                     parse_dates=date_columns)
    return OSSLData(df)

In [None]:
#| eval: false
# Reuses the cached file when it already exists; otherwise it is downloaded first
ossl = get_ossl(force_download=False)

In [None]:
#| eval: false
ossl.visnir_cols[:2], ossl.mir_cols[:2], ossl.properties_cols[:2]

(['scan_visnir.350_ref', 'scan_visnir.352_ref'],
 ['scan_mir.600_abs', 'scan_mir.602_abs'],
 ['dataset.code_ascii_txt', 'id.layer_local_c'])

In [None]:
#| exports
@patch
def _get_valid_spectra_mask(self:OSSLData, spectra_cols):
    "Boolean mask of the rows that have a non-null value in every column of `spectra_cols`"
    has_missing = self.df[spectra_cols].isna().any(axis=1)
    # A row is valid exactly when none of its selected columns is null
    return ~has_missing

In [None]:
#| eval: false
ossl._get_valid_spectra_mask(ossl.mir_cols).sum()

np.int64(85684)

In [None]:
#| exports
@patch
def _extract_wavenumbers(self:OSSLData,
                        cols: List[str] # column names
                        ):
    "Return, as an array, the integer found between a `.` and the following `_` in each column name"
    number_pat = re.compile(r'\.(\d+)_')
    matches = map(number_pat.search, cols)
    return np.array([int(m.group(1)) for m in matches])

In [None]:
#| eval: false
ossl._extract_wavenumbers(ossl.visnir_cols)

array([ 350,  352,  354, ..., 2496, 2498, 2500], shape=(1076,))

In [None]:
#| exports
@patch
def _extract_measurement_type(self:OSSLData,
                              cols: List[str] # column names
                              ):
    "Return the trailing `_<suffix>` token (e.g. `ref`, `abs`) shared by all column names"
    suffix_pat = re.compile(r'_(\w+)$')
    found = {suffix_pat.search(name).group(1) for name in cols}
    # The columns passed in are expected to agree on a single suffix
    assert len(found) == 1, f"Mixed measurement types found: {found}"
    (only,) = found
    return only

In [None]:
#| eval: false
ossl._extract_measurement_type(ossl.visnir_cols), ossl._extract_measurement_type(ossl.mir_cols)

('ref', 'abs')

In [None]:
#| exports
@patch
def _filter_wavelength_range(self:OSSLData,
                             wavenumbers: np.ndarray, # wavenumbers
                             spectra: np.ndarray, # spectra
                             cols: List[str], # column names
                             wmin: Optional[float]=None, # min wavenumber
                             wmax: Optional[float]=None # max wavenumber
                             ):
    "Keep the wavenumbers, spectra columns and column names lying within [`wmin`, `wmax`] (inclusive; `None` = unbounded)"
    # A bound left at None imposes no constraint on that side
    above_min = True if wmin is None else wavenumbers >= wmin
    below_max = True if wmax is None else wavenumbers <= wmax
    keep = np.ones(len(wavenumbers), dtype=bool) & above_min & below_max

    kept_wavenumbers = wavenumbers[keep]
    kept_spectra = spectra[:, keep]
    kept_cols = [cols[pos] for pos in np.where(keep)[0]]
    return kept_wavenumbers, kept_spectra, kept_cols

In [None]:
#| eval: false
# Restrict the VISNIR columns to the 500-1000 range (both bounds are inclusive)
wavenumbers, spectra, cols = ossl._filter_wavelength_range(
    ossl._extract_wavenumbers(ossl.visnir_cols), 
    ossl.df[ossl.visnir_cols].values, 
    ossl.visnir_cols, 
    wmin=500, wmax=1000
)
wavenumbers.min(), wavenumbers.max(), spectra.shape, cols[:2]

(np.int64(500),
 np.int64(1000),
 (135651, 251),
 ['scan_visnir.500_ref', 'scan_visnir.502_ref'])

In [None]:
#| eval: false
ossl.visnir_cols[0], ossl.visnir_cols[-1]

('scan_visnir.350_ref', 'scan_visnir.2500_ref')

In [None]:
#| exports
@patch
def get_visnir(self:OSSLData,
               wmin: Optional[float]=None, # min wavenumber
               wmax: Optional[float]=None, # max wavenumber
               require_valid: bool=True # if True, only return samples with no null values
               ):
    "Get VISNIR spectra within specified wavenumber range"
    wavenumbers = self._extract_wavenumbers(self.visnir_cols)
    # Only the filtered axis values and column names are needed at this point, so a
    # zero-row placeholder stands in for the spectra instead of materialising the
    # full (n_samples, n_cols) matrix just to throw its filtered copy away.
    placeholder = np.empty((0, len(wavenumbers)))
    wavenumbers, _, filtered_cols = self._filter_wavelength_range(
        wavenumbers, placeholder, self.visnir_cols, wmin, wmax
    )

    if require_valid:
        # Validity is judged on the selected columns only
        valid_mask = self._get_valid_spectra_mask(filtered_cols)
        # Select rows and columns in one step rather than first copying every
        # column of the dataframe for the valid rows
        spectra = self.df.loc[valid_mask, filtered_cols].values
        sample_ids = self.sample_ids[valid_mask]
    else:
        spectra = self.df[filtered_cols].values
        sample_ids = self.sample_ids

    measurement_type = self._extract_measurement_type(filtered_cols)
    return SpectraData(wavenumbers, spectra, measurement_type, sample_ids)

In [None]:
#| eval: false
visnir_data = ossl.get_visnir(wmin=500, wmax=1000, require_valid=True)
visnir_data.spectra.shape

(64644, 251)

In [None]:
#| exports
@patch
def get_mir(self:OSSLData,
            wmin: Optional[float]=600, # min wavenumber
            wmax: Optional[float]=4000, # max wavenumber
            require_valid: bool=True # if True, only return samples with no null values
            ):
    "Get MIR spectra within specified wavenumber range"
    wavenumbers = self._extract_wavenumbers(self.mir_cols)
    # Only the filtered axis values and column names are needed at this point, so a
    # zero-row placeholder stands in for the spectra instead of materialising the
    # full (n_samples, n_cols) matrix just to throw its filtered copy away.
    placeholder = np.empty((0, len(wavenumbers)))
    wavenumbers, _, filtered_cols = self._filter_wavelength_range(
        wavenumbers, placeholder, self.mir_cols, wmin, wmax
    )

    if require_valid:
        # Validity is judged on the selected columns only
        valid_mask = self._get_valid_spectra_mask(filtered_cols)
        # Select rows and columns in one step rather than first copying every
        # column of the dataframe for the valid rows
        spectra = self.df.loc[valid_mask, filtered_cols].values
        sample_ids = self.sample_ids[valid_mask]
    else:
        spectra = self.df[filtered_cols].values
        sample_ids = self.sample_ids

    measurement_type = self._extract_measurement_type(filtered_cols)

    return SpectraData(wavenumbers, spectra, measurement_type, sample_ids)

In [None]:
#| eval: false
mir_data = ossl.get_mir(require_valid=True)
mir_data.spectra.shape, mir_data.wavenumbers.min(), mir_data.wavenumbers.max()

((85684, 1701), np.int64(600), np.int64(4000))

In [None]:
#| exports
@patch
def get_properties(self:OSSLData,
                   properties=None, # properties
                   require_complete: bool=False # if True, only return samples with no null values
                   ):
    "Return the requested property columns as a dataframe indexed by sample ID"
    # Normalise the selection: None means every property column, a lone string
    # means a single column
    if properties is None:
        selected = self.properties_cols
    elif isinstance(properties, str):
        selected = [properties]
    else:
        selected = properties

    columns = {'id': self.sample_ids}
    columns.update((col, self.df[col]) for col in selected)
    frame = pd.DataFrame(columns).set_index('id')

    # Optionally drop every row that has a null in any selected column
    return frame.dropna() if require_complete else frame

Get only complete MIR spectra:

In [None]:
#| eval: false
ossl = get_ossl()
mir_data = ossl.get_mir(require_valid=True)

Get properties needed as ML targets (must be complete):

In [None]:
#| eval: false
targets = ossl.get_properties(['cec_usda.a723_cmolc.kg'], require_complete=True)
targets.shape, targets.head()

((57064, 1),
         cec_usda.a723_cmolc.kg
 id                            
 S40857                6.633217
 S40858                3.822628
 S40859                3.427324
 S40860                1.906545
 S40861               13.403203)

Get optional metadata (can have nulls):

In [None]:
#| eval: false
metadata = ossl.get_properties(['longitude.point_wgs84_dd', 'latitude.point_wgs84_dd'], require_complete=False)
metadata.shape, metadata.head()

((135651, 2),
            longitude.point_wgs84_dd  latitude.point_wgs84_dd
 id                                                          
 icr072246                 15.687492                -7.377750
 icr072247                 15.687492                -7.377750
 icr072266                 15.687817                -7.351243
 icr072267                 15.687817                -7.351243
 icr072286                 15.687965                -7.331673)

In [None]:
#| exports
@patch
def get_aligned_data(self:OSSLData,
                    spectra_data: SpectraData, # spectra data
                    target_cols: Union[str, List[str]] # target columns
                    ):
    "Get aligned spectra and target data for ML, along with their sample IDs (rows follow the order of `spectra_data.sample_ids`)."
    # Targets restricted to the rows where every requested column is present
    targets = self.get_properties(target_cols, require_complete=True)
    target_ids = set(targets.index)

    # ID -> row position in the spectra. The dict keeps keys in first-seen order
    # and, for a repeated ID, the last position.
    spectra_id_to_idx = {id_: idx for idx, id_ in enumerate(spectra_data.sample_ids)}

    # Walk the IDs in spectra order instead of iterating a set intersection: the
    # iteration order of a set of strings changes between interpreter runs, which
    # made the row order of the returned arrays irreproducible.
    common_ids = [id_ for id_ in spectra_id_to_idx if id_ in target_ids]
    indices = [spectra_id_to_idx[id_] for id_ in common_ids]

    # Align the data
    features = spectra_data.spectra[indices]
    # NOTE(review): `.loc` returns every row carrying a given ID, so duplicated IDs
    # in the targets index would give more target rows than features - confirm
    # that sample IDs are unique.
    targets = targets.loc[common_ids].values
    sample_ids = np.array(common_ids)

    return features, targets, sample_ids

In [None]:
#| eval: false
def test_get_aligned_data():
    "Check that `get_aligned_data` drops samples with missing targets and keeps rows paired by sample ID"
    # Create a small test DataFrame; A3 has no target value
    test_df = pd.DataFrame({
        'id.layer_local_c': ['A1', 'A2', 'A3', 'A4'],
        'cec_usda.a723_cmolc.kg': [1.0, 2.0, np.nan, 4.0],
        'scan_mir.600_abs': [0.1, 0.2, 0.3, 0.4],
        'scan_mir.700_abs': [1.1, 1.2, 1.3, 1.4]
    })

    # Create OSSLData instance
    ossl = OSSLData(test_df)

    # Get MIR spectra
    mir_data = ossl.get_mir(require_valid=True)

    # Get aligned data
    features, targets, ids = ossl.get_aligned_data(
        spectra_data=mir_data,
        target_cols='cec_usda.a723_cmolc.kg'
    )

    # Shapes
    assert features.shape == (3, 2), f"Expected shape (3, 2), got {features.shape}"
    assert targets.shape == (3, 1), f"Expected shape (3, 1), got {targets.shape}"
    assert len(ids) == 3, f"Expected 3 ids, got {len(ids)}"

    # Check if A3 (with NaN target) is excluded
    assert 'A3' not in ids, "Sample with NaN target should be excluded"

    # Compare in sorted-ID order: what is under test is that each row of features
    # and targets belongs to the ID at the same position, not the order in which
    # the samples happen to be returned.
    order = np.argsort(ids)

    expected_ids = np.array(['A1', 'A2', 'A4'])
    np.testing.assert_array_equal(ids[order], expected_ids)

    expected_targets = np.array([[1.0], [2.0], [4.0]])
    np.testing.assert_array_almost_equal(targets[order], expected_targets)

    expected_features = np.array([
        [0.1, 1.1],
        [0.2, 1.2],
        [0.4, 1.4]
    ])
    np.testing.assert_array_almost_equal(features[order], expected_features)

test_get_aligned_data()

How to use `get_aligned_data`:

In [None]:
#| eval: false
# Pair the MIR spectra with the `cec_usda.a723_cmolc.kg` target; only samples
# present in both the spectra and the complete targets are returned
X, y, ids = ossl.get_aligned_data(
    spectra_data=mir_data,
    target_cols='cec_usda.a723_cmolc.kg'
)

X.shape, y.shape, ids.shape

((57062, 1701), (57062, 1), (57062,))

In [None]:
#| eval: false
X[:5], y[:5], ids[:5]


(array([[1.72552, 1.74202, 1.75742, ..., 0.3168 , 0.316  , 0.31515],
        [1.56018, 1.55495, 1.55105, ..., 0.11978, 0.11947, 0.11925],
        [1.56395, 1.566  , 1.56942, ..., 0.22598, 0.22558, 0.22518],
        [1.59468, 1.5963 , 1.59908, ..., 0.16742, 0.16718, 0.16685],
        [1.406  , 1.40578, 1.407  , ..., 0.09735, 0.09722, 0.09702]],
       shape=(5, 1701)),
 array([[36.05333],
        [ 7.29322],
        [14.60657],
        [30.39838],
        [10.71835]]),
 array(['11762', '197528', '67123', '178714', '168503'], dtype='<U12'))

Later, if you need metadata for these samples:

In [None]:
#| eval: false
# `.loc[ids]` returns the metadata rows in the order of `ids`, i.e. matching X and y;
# nulls are kept because `require_complete` defaults to False
metadata = ossl.get_properties(['longitude.point_wgs84_dd', 'latitude.point_wgs84_dd']).loc[ids]
metadata.head()

Unnamed: 0_level_0,longitude.point_wgs84_dd,latitude.point_wgs84_dd
id,Unnamed: 1_level_1,Unnamed: 2_level_1
11762,-123.096299,45.331855
197528,-91.477896,36.915254
67123,-99.179787,31.150749
178714,,
168503,,
