PREDICTING WITH PARTIAL LEAST SQUARES

---




### 1. Google Colab runtime setup [Optional]

In [None]:
# Mount Google Drive inside the Colab runtime so that files stored there
# can be reached through the local filesystem.
from google.colab import drive

MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Clone and install spectrai package 
!git clone https://github.com/franckalbinet/spectrai.git 
!pip install /content/spectrai 

In [None]:
# Prepare /root folder content: recursively copy the data_spectrai folder
# from the mounted Google Drive into /root.
!cp -r /content/drive/My\ Drive/Colab\ Notebooks/data/data_spectrai /root

In [None]:
# Create configuration file
!mkdir /root/.spectrai_config & cp /content/spectrai/config.toml /root/.spectrai_config

### 2. Import packages

In [None]:
from spectrai.datasets.kssl import (get_tax_orders_lookup_tbl, get_analytes, load_data)
from spectrai.vis.spectra import plot_spectra
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import numpy as np

from scipy.signal import savgol_filter

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### 3. Load KSSL dataset

In [None]:
# Load spectra (X) and targets (y) for analyte 725: Potassium, NH4OAc
ANALYTES = [725]
X, X_names, y, y_names, instances_id = load_data(ANALYTES)

In [None]:
# Quick overview of what was loaded: dimensions, rough memory footprint
# (in units of 10**6 bytes), and the feature / target names.
BYTES_PER_MB = 10**6
x_size_mb = X.nbytes // BYTES_PER_MB
y_size_mb = y.nbytes // BYTES_PER_MB

print('X shape: ', X.shape)
print('X approx. memory size: {} MB'.format(x_size_mb))
print('y approx. memory size: {} MB'.format(y_size_mb))
print('Wavenumbers: ', X_names)
print('Target variable: ', y_names)

X shape:  (50714, 1764)
X approx. memory size: 357 MB
y approx. memory size: 1 MB
Wavenumbers:  [3999 3997 3995 ...  603  601  599]
Target variable:  ['lay_depth_to_top' 'order_id' 'calc_value']


### 4. Data preparation and preprocessing

In [None]:
# Display the lookup table mapping taxonomic order names to integer ids
# (presumably the ids stored in the 'order_id' target column -- confirm
# against spectrai.datasets.kssl).
get_tax_orders_lookup_tbl()

{'alfisols': 0,
 'andisols': 7,
 'aridisols': 11,
 'entisols': 3,
 'gelisols': 12,
 'histosols': 8,
 'inceptisols': 2,
 'mollisols': 1,
 nan: 5,
 'oxisols': 9,
 'spodosols': 4,
 'ultisols': 6,
 'vertisols': 10}

In [None]:
# Restrict the dataset to samples that
#   * have a strictly positive analyte concentration (last column of y), and
#   * belong to the 'alfisols' taxonomic order (column 1 of y, id 0).
TAX_ORDER_ID = 0

idx_y_valid = np.greater(y[:, -1], 0)
idx_order = np.equal(y[:, 1], TAX_ORDER_ID)
idx = np.logical_and(idx_y_valid, idx_order)

# Apply the same boolean row mask to spectra and targets
X_subset, y_subset = X[idx], y[idx]

In [None]:
# Split the selected subset into a train set (60%) and a test set (40%).
# The target is the last column of y_subset; random_state is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_subset,
    y_subset[:, -1],
    test_size=0.40,
    random_state=42,
)

for label, split in (('X train shape: ', X_train),
                     ('X test shape: ', X_test),
                     ('y train shape: ', y_train),
                     ('y test shape: ', y_test)):
    print(label, split.shape)

X train shape:  (2982, 1764)
X test shape:  (1988, 1764)
y train shape:  (2982,)
y test shape:  (1988,)


In [None]:
# Custom transformer, so that the Savitzky-Golay settings can be tuned by a grid search
class TakeDerivative(BaseEstimator, TransformerMixin):
    """Stateless pipeline step applying a Savitzky-Golay filter to the input.

    Parameters
    ----------
    window_length : int, default=11
        Filter window length, forwarded to ``scipy.signal.savgol_filter``.
    polyorder : int, default=1
        Order of the polynomial fitted in each window.
    deriv : int, default=1
        Order of the derivative to compute.
    """

    def __init__(self, window_length=11, polyorder=1, deriv=1):
        # Hyper-parameters are stored under their constructor names so they
        # can be addressed as 'deriv__<name>' in a parameter grid.
        self.window_length = window_length
        self.polyorder = polyorder
        self.deriv = deriv

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return the filtered version of ``X`` (``y`` is ignored)."""
        return savgol_filter(
            X,
            window_length=self.window_length,
            polyorder=self.polyorder,
            deriv=self.deriv,
        )

### 5. Fit and fine-tune PLS model

In [None]:
# Hyper-parameter values to explore; keys follow the '<step>__<parameter>'
# convention of the pipeline steps ('deriv' and 'model').
param_grid = dict(
    deriv__window_length=range(3, 13, 2),  # odd window lengths only: 3, 5, ..., 11
    deriv__polyorder=range(1, 2),          # a single value: 1
    model__n_components=range(2, 20),      # 2 to 19 PLS components
)

In [None]:
# Build the two-step pipeline (derivative pre-processing, then PLS regression)
# and search the hyper-parameter grid with 5-fold cross-validation, scored by R2.
steps = [
    ('deriv', TakeDerivative()),
    ('model', PLSRegression()),
]
pipe = Pipeline(steps)
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    return_train_score=True,
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  4.3min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('deriv',
                                        TakeDerivative(deriv=1, polyorder=1,
                                                       window_length=11)),
                                       ('model',
                                        PLSRegression(copy=True, max_iter=500,
                                                      n_components=2,
                                                      scale=True, tol=1e-06))],
                                verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'deriv__polyorder': range(1, 2),
                         'deriv__window_length': range(3, 13, 2),
                         'model__n_components': range(2, 20)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='r2', verbose=1)

In [None]:
# What is the "best" combination of hyper-parameters,
# i.e. the grid point selected by the cross-validated search above
grid_search.best_params_

{'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 16}

In [None]:
# What is the "best" score, i.e. the cross-validated score (scoring='r2')
# obtained by the best combination of hyper-parameters
grid_search.best_score_

0.4461821015526968

In [None]:
# Print, for every grid point: mean train score, mean test score, parameters
cvres = grid_search.cv_results_
score_columns = ('mean_train_score', 'mean_test_score', 'params')
for row in zip(*(cvres[column] for column in score_columns)):
    print(*row)

0.3167399417375815 0.2962602693684806 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 2}
0.3514656105952848 0.3223630053567927 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 3}
0.3762522338046416 0.33655192123263894 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 4}
0.39764917785284376 0.350928286169803 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 5}
0.4246270178548553 0.3763031554323237 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 6}
0.4361146054758981 0.3768784586969511 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 7}
0.4642720900229068 0.39837036417053223 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 8}
0.4818274218555431 0.40651481024697644 {'deriv__polyorder': 1, 'deriv__window_length': 3, 'model__n_components': 9}
0.5015772664685291 0.41892688982108256 {'deriv__polyorder': 1, 'deriv__window

In [None]:
# Compare the R2 of the best estimator on the training data and on the
# held-out test set (typo "traint" in the printed label fixed to "train").
best_model = grid_search.best_estimator_
print('R2 on train set with best estimator: ', best_model.score(X_train, y_train))
print('R2 on test set with best estimator: ', best_model.score(X_test, y_test))

R2 on traint set with best estimator:  0.5543724075952687
R2 on test set with best estimator:  0.41495689652680884
