# Loading

> Utilities to load ground-truth soil measurements (exchangeable potassium) from [OSSL](https://soilspectroscopy.github.io/ossl-manual/) and other sources.

In [None]:
#| default_exp loading

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from pathlib import Path
from tqdm import tqdm
from typing import Union, List
import re
import fastdownload as fd
import fastcore.all as fc

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')

In [None]:
# | export
# Public download locations of the OSSL v1.2 "all" exports (gzip-compressed CSV).
# Each constant must point at the archive of its own level: the L1 constant
# previously duplicated the L0 URL.
PATH_OSSL_ALL_L0_V1_2 = 'https://storage.googleapis.com/soilspec4gg-public/ossl_all_L0_v1.2.csv.gz'
PATH_OSSL_ALL_L1_V1_2 = 'https://storage.googleapis.com/soilspec4gg-public/ossl_all_L1_v1.2.csv.gz'

In [None]:
# | export
def download(url, dest):
    "Download `url` into `dest` (created, parents included, if it does not exist yet)"
    # Accept plain strings as well as `Path` objects
    dest = Path(dest)
    # Only create `dest` when nothing is there yet; `exist_ok=True` keeps a
    # concurrent creation between the check and the call from raising
    if not dest.exists(): dest.mkdir(parents=True, exist_ok=True)
    # Return whatever `fastdownload.download_url` reports for the saved file
    return fd.download_url(url, dest)

In [None]:
# | export
def load_ossl(analytes: Union[str, List[str]], # Using OSSL's analytes naming conventions
              dest:Path=Path.home() / '.geka/data/ossl', # directory containing the data
              ):
    """Load the OSSL export and keep the rows usable for the analytes of interest.

    The archive is downloaded into `dest` on first use and re-used afterwards.
    Rows with a missing value in any requested analyte, in the longitude/latitude
    or in the observation start date are dropped. The returned frame holds the
    metadata columns, the coordinate/date columns and the analytes, in that order,
    and keeps the row labels of the original file.

    Raises `KeyError` if a requested analyte (or one of the fixed columns) is not
    present in the file.
    """
    # Tolerate a plain string path
    dest = Path(dest)
    url = PATH_OSSL_ALL_L1_V1_2
    # `Path(url).name` only extracts the last URL segment, i.e. the archive file name
    fname = dest / Path(url).name
    if not fname.exists():
        print('Downloading & saving to: ', str(fname))
        download(url, dest)

    print('Reading & selecting data ...')

    # A lone string is a single analyte; any other iterable (list, tuple, ...) is
    # copied into a list so it can be concatenated with the other column lists
    analytes = [analytes] if isinstance(analytes, str) else list(analytes)
    cols_not_nan = ['longitude.point_wgs84_dd', 'latitude.point_wgs84_dd', 'observation.date.begin_iso.8601_yyyy.mm.dd']
    metadata_of_interest = ['dataset.code_ascii_txt', 'id.layer_local_c', 'id.project_ascii_txt',
                            'layer.upper.depth_usda_cm', 'layer.lower.depth_usda_cm']
    cols_out = metadata_of_interest + cols_not_nan + analytes

    # Parse only the columns that are returned instead of the whole table.
    # A callable `usecols` silently skips names absent from the file, so a missing
    # column still surfaces as the `KeyError` raised by `dropna`/indexing below.
    wanted = set(cols_out)
    df = pd.read_csv(fname, compression='infer', low_memory=True,
                     usecols=lambda col: col in wanted)

    df = df.dropna(subset=analytes + cols_not_nan)

    return df[cols_out]

Example:

In [None]:
#|eval: false
# Analyte column name, following OSSL's naming conventions (see `load_ossl`)
analytes = 'k.ext_usda.a725_cmolc.kg'
# Only rows where the analyte, the coordinates and the observation date are all non-null are returned
df = load_ossl(analytes)

print('df shape: ', df.shape)

Reading & selecting data ...
df shape:  (78804, 9)


In [None]:
#|eval: false
# Preview the first rows of the filtered frame
df.head()

Unnamed: 0,dataset.code_ascii_txt,id.layer_local_c,id.project_ascii_txt,layer.upper.depth_usda_cm,layer.lower.depth_usda_cm,longitude.point_wgs84_dd,latitude.point_wgs84_dd,observation.date.begin_iso.8601_yyyy.mm.dd,k.ext_usda.a725_cmolc.kg
3633,GARRETT.SSL,S40857,Forest soil data from New Zealand (Scion Resea...,0.0,10.0,174.42,-36.78,2000-06-19,0.200688
3634,GARRETT.SSL,S40858,Forest soil data from New Zealand (Scion Resea...,10.0,20.0,174.42,-36.78,2000-06-19,0.08626
3635,GARRETT.SSL,S40859,Forest soil data from New Zealand (Scion Resea...,20.0,38.0,174.42,-36.78,2000-06-19,0.094081
3636,GARRETT.SSL,S40860,Forest soil data from New Zealand (Scion Resea...,38.0,90.0,174.42,-36.78,2000-06-19,0.144572
3637,GARRETT.SSL,S40861,Forest soil data from New Zealand (Scion Resea...,0.0,10.0,174.55,-36.67,2000-06-20,0.310516
