PREDICTING ANALYTE(S) FROM ANALYTE(S)

---

### 1. Google Colab runtime setup [Optional]

In [None]:
# Clone and install spectrai package 
!git clone https://github.com/franckalbinet/spectrai.git 
!pip install /content/spectrai 

In [None]:
# Mount Google Drive under /content/drive so that later cells can read files stored there
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Prepare /root folder content: recursively copy the data_spectrai folder from the mounted Drive into /root
!cp -r /content/drive/My\ Drive/Colab\ Notebooks/data/data_spectrai /root

In [None]:
# Create configuration file
!mkdir /root/.spectrai_config & cp /content/spectrai/config.toml /root/.spectrai_config

### 2. Import packages

In [230]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from spectrai.datasets.kssl import (get_tax_orders_lookup_tbl, load_data,
                                    load_analytes, load_data_analytes, load_fact_tbl, load_spectra)
from spectrai.vis.spectra import (plot_spectra)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 3.  Load KSSL dataset for analyte(s) -> analyte(s) modeling

In [162]:
# Analyte ids used in this notebook: what is predicted (target) and what it is predicted from (features)
targets = [725]

# Candidate features: analytes where nb_samples > 30000 (i.e. the most frequently measured analytes)
features = [
    622, 623, 624, 420, 723, 724, 722, 726, 750, 481, 268, 339,
    334, 342, 338, 337, 343, 340, 341, 417, 65, 66, 67,
]

# Tetsuya features
tetsuya_features = [383, 368]

# Selected features (note: 383 is not part of `features` above)
features_selected = [383, 723, 481, 726, 268, 417]

In [176]:
# Load the feature matrix X and target y for the chosen analyte ids; X_names / y_names hold
# the analyte ids behind the columns (instances_id: presumably the sample identifiers — confirm)
X, X_names, y, y_names, instances_id = load_data_analytes(features=features, targets=targets)

In [177]:
# Sanity check of what was loaded: dimensions, approximate memory footprint and column ids
x_size_mb = X.nbytes / 10**6
y_size_mb = y.nbytes / 10**6

print('X shape: ', X.shape)
print('X approx. memory size: {:.2f} MB'.format(x_size_mb))
print('y approx. memory size: {:.2f} MB'.format(y_size_mb))
print('Features: ', X_names)
print('Target variable: ', y_names)

X shape:  (20862, 23)
X approx. memory size: 3.84 MB
y approx. memory size: 0.17 MB
Features:  [622 623 624 420 723 724 722 726 750 481 268 339 334 342 338 337 343 340
 341 417  65  66  67]
Target variable:  [725]


### 4.  Modeling

* **Utility functions**

In [178]:
def prettify_features_importance(features_importance, ascending_sort=False, label='feature_importance',
                                 feature_ids=None, df_analytes=None):
    """Return a table of analytes ranked by a per-feature score.

    Args:
        features_importance: One score per feature, in the same order as `feature_ids`.
        ascending_sort: Sort the scores in ascending order when True (default: descending).
        label: Name of the column that will hold the scores.
        feature_ids: Analyte ids of the features. When omitted, the notebook-level
            `X_names` is used (the original behaviour).
        df_analytes: Analytes lookup table with at least the columns `analyte_id` and
            `analyte_name`. When omitted, it is loaded with `load_analytes()`.

    Returns:
        DataFrame with the columns `analyte_id`, `analyte_name` and `label`,
        sorted by `label` (signed values, not magnitudes).
    """
    if feature_ids is None:
        # Backward-compatible default: rely on the features loaded earlier in the notebook
        feature_ids = X_names
    if df_analytes is None:
        df_analytes = load_analytes()

    # .copy() so that adding the score column never writes into a view of the lookup table
    df = df_analytes.set_index('analyte_id').loc[feature_ids, :].copy()
    df[label] = features_importance
    return df \
        .sort_values(by=[label], ascending=ascending_sort) \
        .reset_index()[['analyte_id', 'analyte_name', label]]

* **Assess Correlation between features and target**

In [179]:
# Pearson correlation of each feature column with the target: last column of the correlation
# matrix, without its final entry (the target's correlation with itself)
corr_with_target = np.corrcoef(np.c_[X, y], rowvar=False)[:-1, -1]
prettify_features_importance(corr_with_target, label='corr. with potassium (725)')

Unnamed: 0,analyte_id,analyte_name,corr. with potassium (725)
0,723,"CEC, NH4OAc, pH 7.0, 2M KCl displacement",0.383707
1,481,"pH, 1:2 Soil-CaCl2 Suspension",0.297316
2,726,"Sodium, NH4OAc Extractable, 2M KCl displacement",0.286813
3,268,"pH, 1:1 Soil-Water Suspension",0.282177
4,417,"Water Retention, 15 Bar, <2mm, Air-dry",0.277948
5,724,"Magnesium, NH4OAc Extractable, 2M KCl displace...",0.260085
6,420,"Ratio, Air-dry/Ovendry",0.248446
7,334,Clay,0.231424
8,337,"Clay, Noncarbonate",0.221178
9,722,"Calcium, NH4OAc Extractable, 2M KCl displacement",0.212286


* **Data preparation**

In [180]:
# No subsetting is applied here: both names simply refer to the full arrays loaded above
X_subset = X
y_subset = y

In [181]:
# Standardise every feature column.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split below,
# so rows that end up in the test set contribute to the scaling statistics.
X_subset = StandardScaler().fit_transform(X_subset)

In [182]:
# Classical train, test split
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.20, random_state=42)

# Fit the scaler on the training rows only and apply it to both splits, so that no
# statistic from the held-out rows leaks into training. Standardisation is unaffected by a
# previous per-feature rescaling, so this holds whether or not X_subset was already scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print('X train shape: ', X_train.shape)
print('X test shape: ', X_test.shape)
print('y train shape: ', y_train.shape)

X train shape:  (16689, 23)
X test shape:  (4173, 23)
y train shape:  (16689, 1)


* **Linear Regression**

In [183]:
# Baseline: ordinary least squares fitted on the training split
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [184]:
# Compare goodness of fit (R2) on the training and the held-out split
lr_n_train = X_train.shape[0]
lr_r2_train = lr.score(X_train, y_train)
lr_r2_test = lr.score(X_test, y_test)

print('# of training sample: {}'.format(lr_n_train))
print('Train R2 score: {:.3f}'.format(lr_r2_train))
print('Test R2 score: {:.3f}'.format(lr_r2_test))

# of training sample: 16689
Train R2 score: 0.291
Test R2 score: 0.288


In [185]:
# y_train is 2-D (n_samples, 1), so coef_ holds a single row of per-feature coefficients.
# The table is ordered by signed coefficient, not by magnitude.
prettify_features_importance(lr.coef_[0])

Unnamed: 0,analyte_id,analyte_name,feature_importance
0,723,"CEC, NH4OAc, pH 7.0, 2M KCl displacement",0.391121
1,481,"pH, 1:2 Soil-CaCl2 Suspension",0.183592
2,726,"Sodium, NH4OAc Extractable, 2M KCl displacement",0.140379
3,334,Clay,0.114973
4,624,"Sulfur, Total NCS",0.069841
5,417,"Water Retention, 15 Bar, <2mm, Air-dry",0.031211
6,67,"Manganese, Dithionite Citrate Extractable",0.027836
7,750,"Volume, CEC, NH4OAc, Syringe Extract, 2M KCl d...",0.014513
8,623,"Nitrogen, Total NCS",0.006921
9,342,"Sand, Very Fine",0.004183


* **XGBoost**

In [188]:
# Gradient-boosted trees with n_estimators=500; every other hyper-parameter is left at its default
xgb_reg = xgb.XGBRegressor(n_estimators=500)
xgb_reg.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [189]:
# Compare goodness of fit (R2) on the training and the held-out split
xgb_n_train = X_train.shape[0]
xgb_r2_train = xgb_reg.score(X_train, y_train)
xgb_r2_test = xgb_reg.score(X_test, y_test)

print('# of training sample: {}'.format(xgb_n_train))
print('Train R2 score: {:.3f}'.format(xgb_r2_train))
print('Test R2 score: {:.3f}'.format(xgb_r2_test))

# of training sample: 16689
Train R2 score: 0.996
Test R2 score: 0.566


In [190]:
# Rank the features by the importances reported by the fitted XGBoost model
# (the model repr printed after fitting shows importance_type='gain')
prettify_features_importance(xgb_reg.feature_importances_)

Unnamed: 0,analyte_id,analyte_name,feature_importance
0,723,"CEC, NH4OAc, pH 7.0, 2M KCl displacement",0.147796
1,726,"Sodium, NH4OAc Extractable, 2M KCl displacement",0.143232
2,722,"Calcium, NH4OAc Extractable, 2M KCl displacement",0.092605
3,420,"Ratio, Air-dry/Ovendry",0.074821
4,481,"pH, 1:2 Soil-CaCl2 Suspension",0.064489
5,66,"Iron, Dithionite Citrate Extractable",0.056164
6,67,"Manganese, Dithionite Citrate Extractable",0.044407
7,65,"Aluminum, Dithionite Citrate Extractable",0.043099
8,724,"Magnesium, NH4OAc Extractable, 2M KCl displace...",0.034143
9,339,"Sand, Fine",0.0314


### 5. In-depth analysis of analyte correlations

In [None]:
# `df_fact` is not defined anywhere earlier in this notebook, so load it explicitly:
# on a fresh kernel this cell would otherwise raise a NameError.
# NOTE(review): assumes load_fact_tbl() can be called without arguments and returns a
# table with an 'analyte_id' column — confirm against spectrai.datasets.kssl.
df_fact = load_fact_tbl()

# Correlation of every analyte, taken one at a time, with the target analyte (725)
corrs = []
for analyte in tqdm(df_fact['analyte_id'].unique().tolist()):
    # Dedicated names: the notebook-level X / y used by the modelling cells are left untouched
    X_analyte, _, y_target, _, _ = load_data_analytes(features=[analyte], targets=[725])
    corrs.append({
        'analyte_id': analyte,
        # Single feature column + target -> 2x2 correlation matrix; keep the off-diagonal entry
        'correlation': np.corrcoef(np.c_[X_analyte, y_target].T)[:, -1][:-1].item(),
        'nb_samples': X_analyte.shape[0]
    })

In [260]:
TARGET_ANALYTE_ID = 725  # the analyte being predicted; its correlation with itself is meaningless
MIN_NB_SAMPLES = 30      # keep only correlations computed on more than this many samples

# Built as one chain so the new column is never assigned on a filtered slice
# (which can trigger SettingWithCopyWarning) and the cell can be re-run safely.
df_corrs = (
    pd.DataFrame(corrs)
    .dropna()  # analytes for which no correlation could be computed
    .loc[lambda d: (d['analyte_id'] != TARGET_ANALYTE_ID) & (d['nb_samples'] > MIN_NB_SAMPLES)]
    .assign(abs_correlation=lambda d: d['correlation'].abs())
    .sort_values(by=['abs_correlation', 'nb_samples'], ascending=False)
)

In [261]:
# Preview the strongest correlations (the frame is sorted by absolute correlation, descending)
df_corrs.head()

Unnamed: 0,analyte_id,correlation,nb_samples,abs_correlation
267,1065,0.920296,1657,0.920296
219,1167,0.742382,770,0.742382
339,776,-0.711802,32,0.711802
341,794,-0.670244,32,0.670244
201,839,0.625547,731,0.625547


In [271]:
# `df_analytes` only exists as a local inside prettify_features_importance, so load it here
# to keep the cell runnable on a fresh kernel.
df_analytes = load_analytes()

# Attach the analyte name and unit of measure. Any copy of these columns left by a previous
# run is dropped first, otherwise the merge would create *_x / *_y duplicates and the
# column selection below would fail on a second execution.
lookup_cols = ['analyte_name', 'uom_abbrev']
df_corrs = (
    df_corrs
    .drop(columns=lookup_cols, errors='ignore')
    .merge(df_analytes[['analyte_id'] + lookup_cols], on='analyte_id')
    [['analyte_id', 'analyte_name', 'uom_abbrev', 'correlation', 'nb_samples', 'abs_correlation']]
)

In [308]:
# First 25 rows of the correlation table, now including analyte names and units
df_corrs.head(25)

Unnamed: 0,analyte_id,analyte_name,uom_abbrev,correlation,nb_samples,abs_correlation
0,1065,"Potassium, Element Mehlich3 Extractable",mg/kg,0.920296,1657,0.920296
1,1167,"Potassium, Water Extractable",µg/kg,0.742382,770,0.742382
2,776,"Carbon, pom mineral",g/g,-0.711802,32,0.711802
3,794,"FMOD, Bio, whole soil",(NA),-0.670244,32,0.670244
4,839,"Phosphorus, Anion Resin Extractable, 24 hour",mg/kg,0.625547,731,0.625547
5,838,"Phosphorus, Anion Resin Extractable, 1 hour",mg/kg,0.592839,730,0.592839
6,364,"Electrical Conductivity, Predict, 1:2 (w/w)",dS/m,0.589672,31644,0.589672
7,1406,"p-nitrophenol, ß-Glucosidase",mg/kg/hr,0.584318,5783,0.584318
8,726,"Sodium, NH4OAc Extractable, 2M KCl displacement",cmol(+)/kg,0.572129,55661,0.572129
9,772,"Carbon, pom",g/g,-0.571535,32,0.571535


In [276]:
# Export the correlation table (values rounded to two decimals in the sheet).
# The output folder is created first: to_excel does not create missing directories.
corrs_xlsx_path = Path('data/correlations_with_potassium.xlsx')
corrs_xlsx_path.parent.mkdir(parents=True, exist_ok=True)
df_corrs.to_excel(corrs_xlsx_path, float_format="%.2f", index=False)