In [None]:
#|default_exp data.loading

# Loading

> Utility functions to load MIRS spectra (features) and wet chemistry (target)

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#|export
from pathlib import Path
import pandas as pd
import pickle
import numpy as np
from fastcore.test import *
from typing import List

# Let pandas display up to 500 rows before truncating its output.
# NOTE(review): this statement sits in an `#|export` cell, so it presumably also
# runs as a side effect when the exported module is imported — confirm that is intended.
pd.set_option('display.max_rows', 500)

In [None]:
#|export
def get_spectra(fname:str, # path to the CSV file containing the spectra
                org_name:str='organization', # column name containing name of the organization
                id_name:str='sample_id', # column name containing the sample ids
                wn_colrange:List[int]=range(2,1703), # column indices of wavenumbers
               ):
    """Loads spectra (features) & associated wavenumbers, organization name, sample ids

    Reads `fname` with `pd.read_csv` and returns the tuple
    `(orgs, smp_ids, wns, spectra, df)` where:

    - `orgs`: values of column `org_name` as a NumPy array
    - `smp_ids`: values of column `id_name` as a NumPy array
    - `wns`: labels of the columns at positions `wn_colrange`, cast to `int`
    - `spectra`: values of the columns at positions `wn_colrange` (rows x wavenumbers)
    - `df`: the full DataFrame as read from disk
    """
    df = pd.read_csv(fname)
    # Materialise the positions once so that the very same columns provide both
    # the wavenumber labels and the spectra values (they must stay aligned).
    wn_cols = list(wn_colrange)
    orgs = df[org_name].to_numpy()
    smp_ids = df[id_name].to_numpy()
    # Column labels are read as strings from the CSV header; cast them to integers.
    wns = df.columns[wn_cols].astype(int).to_numpy()
    spectra = df.iloc[:, wn_cols].to_numpy()
    return orgs, smp_ids, wns, spectra, df

In [None]:
# Example usage: load the raw MIR spectra CSV (path is relative to the notebook's
# working directory) and unpack everything `get_spectra` returns.
fname = './files/RT_STD_allMIRspectra_raw.csv'
orgs, smp_ids, wns, spectra, df = get_spectra(fname)

In [None]:
# Preview the first rows of the DataFrame returned by `get_spectra`.
df.head()

Unnamed: 0,organization,sample_id,600,602,604,606,608,610,612,614,...,3982,3984,3986,3988,3990,3992,3994,3996,3998,4000
0,Agrocares,RT_01,2.14711,2.13471,2.12274,2.11369,2.10487,2.09844,2.09177,2.08036,...,1.09917,1.09898,1.09877,1.09851,1.09827,1.09811,1.09797,1.09797,1.09797,1.09797
1,Agrocares,RT_02,2.32404,2.33516,2.34558,2.35114,2.35641,2.35855,2.36069,2.36288,...,1.24065,1.24042,1.24015,1.23977,1.23938,1.23896,1.23861,1.23861,1.23861,1.23861
2,Agrocares,RT_03,2.50423,2.50755,2.50973,2.50403,2.49913,2.50261,2.50697,2.5291,...,1.25036,1.24959,1.24877,1.24777,1.24679,1.24585,1.24505,1.24505,1.24505,1.24505
3,Agrocares,RT_04,2.30902,2.29788,2.28667,2.2749,2.26393,2.26128,2.25905,2.26521,...,1.15192,1.15176,1.15158,1.15132,1.15106,1.1508,1.15058,1.15058,1.15058,1.15058
4,Agrocares,RT_05,2.28727,2.28852,2.29013,2.29434,2.29856,2.30286,2.307,2.30782,...,1.16109,1.16106,1.16102,1.16095,1.16086,1.1607,1.16056,1.16056,1.16056,1.16056


In [None]:
# Quick look at each array returned by `get_spectra`; only the shape of the
# spectra matrix is printed rather than its full content.
print(f'orgs: {orgs}')
print(f'smp_ids: {smp_ids}')
print(f'wns: {wns}')
print(f'spectra shape: {spectra.shape}')

orgs: ['Agrocares' 'Agrocares' 'Agrocares' ... 'Woodwell_vertex'
 'Woodwell_vertex' 'Woodwell_vertex']
smp_ids: ['RT_01' 'RT_02' 'RT_03' ... 'RT_68' 'RT_69' 'RT_70']
wns: [ 600  602  604 ... 3996 3998 4000]
spectra shape: (1118, 1701)


In [None]:
#|export
def get_wetchem(fname:str, # path to the CSV file containing the wet chemistry data
                analyte:str='clay_perc', # analyte name
                dropna:bool=True # whether or not you want to get rid of NaN values (they are replaced by column means)
                ):
    """Loads wet chemistry (target)

    Reads `fname` with `pd.read_csv` and returns the tuple `(y, y_napt, df)`:

    - `y`: values of column `analyte` for the first 60 rows
    - `y_napt`: values of column `analyte` for all remaining rows
    - `df`: the DataFrame the values were taken from (imputed when `dropna` is True)

    When `dropna` is True (default), missing values are replaced by the mean of
    their column, computed over all rows for the columns from position 2 onward.
    No row is removed, so the positional split stays aligned with the file.
    When `dropna` is False, missing values are left as NaN.
    """
    df = pd.read_csv(fname)
    if dropna:
        # `numeric_only=True` keeps non-numeric columns out of the mean, which
        # would otherwise raise on recent pandas versions.
        col_means = df.iloc[:, 2:].mean(numeric_only=True)
        df = df.fillna(col_means)
    target = df[analyte]
    # Positional split of the rows.
    # NOTE(review): 60 presumably separates the main samples from the ones
    # exposed as `y_napt` — confirm against the data file.
    n_first = 60
    y, y_napt = target[:n_first], target[n_first:]
    return y.to_numpy(), y_napt.to_numpy(), df

In [None]:
# Example usage: load the wet chemistry CSV (path is relative to the notebook's
# working directory).
# NOTE: this rebinds `fname` and `df`, which previously referred to the spectra
# file and DataFrame; from here on `df` is the wet chemistry table.
fname = './files/RT_wetchem_soildata.csv'
y, y_napt, df = get_wetchem(fname)

In [None]:
# (rows, columns) of the wet chemistry DataFrame returned by `get_wetchem`.
df.shape

(70, 8)

In [None]:
# Analyte values for the first 60 rows, as sliced by `get_wetchem`.
y

array([ 6.28199   ,  4.44442   , 42.19991   , 17.10475   , 17.22985   ,
       18.20389   , 17.07796   , 27.46067   , 12.91032   , 21.88833   ,
       17.47756   , 41.04345   , 16.95715   ,  5.1238    , 15.98527   ,
       21.40597   , 21.31223   , 21.5113    , 21.0915    , 38.90257   ,
        2.19527   , 27.65285   ,  0.92417   , 12.71177   , 12.99189   ,
       27.1811    , 26.85221   , 21.51316   , 21.31897   , 21.2085    ,
       20.96345   , 39.79006   , 43.33046   , 24.97947   , 16.73154   ,
       12.85499   , 13.04752   , 27.75883   , 36.70255   , 16.00733   ,
       15.95912   , 22.71649246, 33.13734   ,  0.        , 39.87246   ,
       41.92131   , 39.77298   , 30.38773   , 32.40552   , 17.04644   ,
       18.67273   , 34.20548   , 39.75466   , 35.47979   , 43.45771   ,
       17.35687   , 18.76691   , 29.71495   , 31.81229   , 15.38471   ])

In [None]:
# Analyte values for the rows after the first 60, as sliced by `get_wetchem`.
y_napt

array([15.2,  4.1, 26.9,  3.6, 19.1, 25. , 28.4, 20. , 49.3,  8.4])