PREDICTING WITH PARTIAL LEAST SQUARES

---




### 1. Google Colab runtime setup [Optional]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone and install spectrai package 
!git clone https://github.com/franckalbinet/spectrai.git 
!pip install /content/spectrai 

In [None]:
# Prepare /root folder content
!cp -r /content/drive/My\ Drive/Colab\ Notebooks/data/data_spectrai /root

In [None]:
# Create configuration file
!mkdir /root/.spectrai_config & cp /content/spectrai/config.toml /root/.spectrai_config

### 2. Import packages

In [19]:
from spectrai.datasets.kssl import (get_tax_orders_lookup_tbl, get_analytes, load_data)
from spectrai.vis.spectra import plot_spectra
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from tqdm import tqdm

from scipy.signal import savgol_filter

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 3. Load KSSL dataset

In [None]:
# Loading Potassium, NH4OAc
X, X_names, y, y_names, instances_id = load_data([725])

In [None]:
print('X shape: ', X.shape)
print('X approx. memory size: {} MB'.format(X.nbytes // 10**6))
print('y approx. memory size: {} MB'.format(y.nbytes // 10**6))
print('Wavenumbers: ', X_names)
print('Target variable: ', y_names)

X shape:  (50714, 1764)
X approx. memory size: 357 MB
y approx. memory size: 1 MB
Wavenumbers:  [3999 3997 3995 ...  603  601  599]
Target variable:  ['lay_depth_to_top' 'order_id' 'calc_value']


### 4. Data preparation and preprocessing

In [None]:
# Display taxonomic orders
get_tax_orders_lookup_tbl()

{'alfisols': 0,
 'andisols': 7,
 'aridisols': 11,
 'entisols': 3,
 'gelisols': 12,
 'histosols': 8,
 'inceptisols': 2,
 'mollisols': 1,
 nan: 5,
 'oxisols': 9,
 'spodosols': 4,
 'ultisols': 6,
 'vertisols': 10}

In [None]:
# Keeping data with analyte concentration > 0 only and for 'inceptisols' taxonomic order only.
TAX_ORDER_ID = 2

idx_y_valid = y[:, -1] > 0
idx_order = y[:,1] == TAX_ORDER_ID
idx = idx_y_valid & idx_order

X_subset = X[idx,:]
y_subset = y[idx,:]

### 5. Fit  and fine-tune PLS model 

In [None]:
# Creating train, valid, test sets
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset[:, -1], test_size=0.30, random_state=42)

print('X train shape: ', X_train.shape)
print('X test shape: ', X_test.shape)
print('y train shape: ', y_train.shape)
print('y test shape: ', y_test.shape)

X train shape:  (2860, 1764)
X test shape:  (1226, 1764)
y train shape:  (2860,)
y test shape:  (1226,)


In [None]:
# Set grid of hyper-parameters values to explore
param_grid = {'n_estimators': range(2, 130, 30),
              'max_depth': range(4, 6),
              'max_leaf_nodes': range(5, 20, 5)}

In [15]:
# Setup and fit the pipeline
grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='r2', return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': range(4, 6),
     

In [16]:
# What is the "best" combination of hyper-parameters
grid_search.best_params_

{'max_depth': 5, 'max_leaf_nodes': 15, 'n_estimators': 62}

In [18]:
# What is the "best" score
grid_search.best_score_

0.4185448992654145

In [17]:
print('R2 on traint set with best estimator: ', grid_search.best_estimator_.score(X_train, y_train))
print('R2 on test set with best estimator: ', grid_search.best_estimator_.score(X_test, y_test))

R2 on traint set with best estimator:  0.6215015015363958
R2 on test set with best estimator:  0.39933953003983635
