PREDICTING WITH PARTIAL LEAST SQUARES AND WAVELETS SCATTERING

---



### 1. Google Colab runtime setup [Optional]

In [38]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Clone and install spectrai package 
!git clone https://github.com/franckalbinet/spectrai.git 
!pip install /content/spectrai 

In [None]:
# Prepare /root folder content
!cp -r /content/drive/My\ Drive/Colab\ Notebooks/data/data_spectrai /root

In [None]:
# Create configuration file
!mkdir /root/.spectrai_config & cp /content/spectrai/config.toml /root/.spectrai_config

### 2. Import packages

In [400]:
from spectrai.datasets.kssl import (get_tax_orders_lookup_tbl, get_analytes, load_data)
from spectrai.vis.spectra import plot_spectra
from spectrai.features.preprocessing import TakeDerivative
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import train_test_split
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict

from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

#from kymatio.sklearn import Scattering1D
from kymatio.numpy import Scattering1D

import numpy as np

import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [132]:
?Scattering1D

### 3. Load KSSL dataset

In [6]:
# Loading Potassium, NH4OAc
X, X_names, y, y_names, instances_id = load_data([725])

In [7]:
print('X shape: ', X.shape)
print('X approx. memory size: {} MB'.format(X.nbytes // 10**6))
print('y approx. memory size: {} MB'.format(y.nbytes // 10**6))
print('Wavenumbers: ', X_names)
print('Target variable: ', y_names)

X shape:  (50714, 1764)
X approx. memory size: 357 MB
y approx. memory size: 1 MB
Wavenumbers:  [3999 3997 3995 ...  603  601  599]
Target variable:  ['lay_depth_to_top' 'order_id' 'calc_value']


### 4. Data preparation and preprocessing

In [8]:
# Display taxonomic orders
get_tax_orders_lookup_tbl()

{'alfisols': 0,
 'mollisols': 1,
 'inceptisols': 2,
 'entisols': 3,
 'spodosols': 4,
 nan: 5,
 'ultisols': 6,
 'andisols': 7,
 'histosols': 8,
 'oxisols': 9,
 'vertisols': 10,
 'aridisols': 11,
 'gelisols': 12}

In [9]:
# Keeping data with analyte concentration > 0 only and for 'alfisols' taxonomic order only.
TAX_ORDER_ID = 0

idx_y_valid = y[:, -1] > 0
idx_order = y[:,1] == TAX_ORDER_ID
idx = idx_y_valid & idx_order

X_subset = X[idx,:]
y_subset = y[idx,:]

In [10]:
# Creating train, valid, test sets
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset[:, -1], test_size=0.40, random_state=42)

print('X train shape: ', X_train.shape)
print('X test shape: ', X_test.shape)
print('y train shape: ', y_train.shape)
print('y test shape: ', y_test.shape)

X train shape:  (2982, 1764)
X test shape:  (1988, 1764)
y train shape:  (2982,)
y test shape:  (1988,)


### 5. Fit  and fine-tune PLS model 

In [404]:
from sklearn.base import BaseEstimator, TransformerMixin

class FlattenScattering(BaseEstimator, TransformerMixin):
    def __init__(self, order0=False):
        self.order0 = order0

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        batch_size = X.shape[0]
        return X[:, 1:, :].reshape(batch_size, -1)

    
class PruneScattering(BaseEstimator, TransformerMixin):
    def __init__(self, q_features=90, q_keep=50):
        self.q_features = q_features
        self.q_keep = q_keep

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        qs = np.percentile(X, self.q_features, axis=0)
        idx = (qs >= np.percentile(qs, self.q_keep))
        return X[:, idx]
    
    
class Scattering1DTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, shape, J=4, Q=12):
        self.shape = shape
        self.J = J
        self.Q = Q

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):        
        scattering = Scattering1D(self.J, self.shape, self.Q)
        return scattering(X)

In [437]:
pipe = Pipeline([('norm1', MinMaxScaler()), 
                 ('scat', Scattering1DTransformer(X_train.shape[1], J=4, Q=12)),
                 ('flatten', FlattenScattering(order0=False)),
                 #('prune', PruneScattering(q_features=90, q_keep=50)),
                 ('pca', PCA(n_components=40)),                 
                 #('norm2', MinMaxScaler()),
                 #('kbest', SelectKBest(f_regression, k=200)),
                 ('model', RandomForestRegressor(n_estimators=20, max_depth=20))
                ])

In [438]:
model = pipe.fit(X_train, y_train)

In [439]:
model.score(X_train, y_train)

0.8883710150195963

In [440]:
model.score(X_test, y_test)

0.2952016735548163

In [212]:
?Pipeline

In [200]:
# Set grid of hyper-parameters values to explore
param_grid = {'scat__J': [4], 
              'scat__Q': [12],
              'post__order0': [False],
              'model__n_components': [15]}

pipe = Pipeline([('norm1', MinMaxScaler()), 
                 ('scat', Scattering1DTransformer(X_train.shape[1])),
                 ('post', PostprocessScattering()),
                 ('norm2', MinMaxScaler()),
                 ('model', PLSRegression())])

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='r2', return_train_score=True, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('norm1',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('scat',
                                        Scattering1DTransformer(J=4, Q=12,
                                                                shape=1764)),
                                       ('post',
                                        PostprocessScattering(order0=False)),
                                       ('norm2',
                                        MinMaxScaler(copy=True,
                                                     feature_range=(0, 1))),
                                       ('model',
                                        PLSRegression(copy=True, max_iter=500,
                                                      n_components=2

In [198]:
# What is the "best" combination of hyper-parameters
grid_search.best_params_

{'model__n_components': 10, 'post__order0': False, 'scat__J': 3, 'scat__Q': 12}

In [202]:
# What is the "best" score
grid_search.best_score_

0.3082851827093354

In [191]:
print('R2 on traint set with best estimator: ', grid_search.best_estimator_.score(X_train, y_train))
print('R2 on test set with best estimator: ', grid_search.best_estimator_.score(X_test, y_test))

2982
R2 on traint set with best estimator:  0.2751052075602598
1988
R2 on test set with best estimator:  0.18531344438923703
