# Global Sensitivity Analysis

Global sensitivity analysis (GSA) using Sobol analysis (via the SALib package) for the GT data.

Workflow:
* Build GP surrogate models to predict the output(s) - one separate model per output
* Generate samples using SALib
* Evaluate the samples using the GP models
* Run the Sobol analysis using SALib

In [20]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from botorch.utils.transforms import unnormalize, normalize
from botorch.models import SingleTaskGP
from botorch.models.model_list_gp_regression import ModelListGP
from gpytorch.mlls.sum_marginal_log_likelihood import SumMarginalLogLikelihood
from gpytorch.mlls.sum_marginal_log_likelihood import ExactMarginalLogLikelihood
from botorch import fit_gpytorch_mll
from botorch.models.transforms.outcome import Standardize
from torch.optim import Adam

In [2]:
# Workbook that holds both the design inputs and the measured responses.
filename = r'../data/olhs_run1.xlsx'

# Both sheets use a two-row column header and take their index from the first column.
x_pd = pd.read_excel(
    filename,
    sheet_name='Initial Design (OLHS)',
    header=[0, 1],
    index_col=[0],
)
y_pd = pd.read_excel(
    filename,
    sheet_name='bo_data',
    header=[0, 1],
    index_col=[0],
)

dtype = torch.double

# Response columns that are modelled; their order fixes the column order of `y`.
objective_properties = ['Polymer Solubility', 'Gelation Enthalpy', 'Shear Modulus']

x_np = x_pd.values
y_np = y_pd[objective_properties].values
y = torch.tensor(y_np, dtype=dtype)

# One [first, second] pair per input column (8 inputs). Transposing yields a
# 2 x 8 tensor, which is the layout handed to `normalize` as `bounds`.
x_bounds = torch.tensor(
    np.array(
        [
            [2000, 10000],
            [0, 100],
            [0, 40],
            [5000, 15000],
            [80, 100],
            [0, 100],
            [60, 100],
            [70, 100],
        ]
    ).T,
    dtype=dtype,
)

# Inputs are rescaled with these bounds before any model sees them.
x = normalize(torch.tensor(x_np, dtype=dtype), bounds=x_bounds)

Define and train surrogate model

In [3]:
# One single-output GP per entry of `objective_properties`; each model gets
# its own Standardize outcome transform.
models = [
    SingleTaskGP(x, y[:, col].unsqueeze(-1), outcome_transform=Standardize(m=1))
    for col in range(len(objective_properties))
]

# Wrap the individual GPs in one ModelListGP and fit them through the summed
# marginal log likelihood, moved to the dtype/device of `x`.
model = ModelListGP(*models)
mll = SumMarginalLogLikelihood(model.likelihood, model).to(x)

# Left as the final expression so the notebook displays the fitted MLL.
fit_gpytorch_mll(mll)

SumMarginalLogLikelihood(
  (likelihood): LikelihoodList(
    (likelihoods): ModuleList(
      (0-2): 3 x GaussianLikelihood(
        (noise_covar): HomoskedasticNoise(
          (noise_prior): GammaPrior()
          (raw_noise_constraint): GreaterThan(1.000E-04)
        )
      )
    )
  )
  (model): ModelListGP(
    (models): ModuleList(
      (0-2): 3 x SingleTaskGP(
        (likelihood): GaussianLikelihood(
          (noise_covar): HomoskedasticNoise(
            (noise_prior): GammaPrior()
            (raw_noise_constraint): GreaterThan(1.000E-04)
          )
        )
        (mean_module): ConstantMean()
        (covar_module): ScaleKernel(
          (base_kernel): MaternKernel(
            (lengthscale_prior): GammaPrior()
            (raw_lengthscale_constraint): Positive()
          )
          (outputscale_prior): GammaPrior()
          (raw_outputscale_constraint): Positive()
        )
        (outcome_transform): Standardize()
      )
    )
    (likelihood): LikelihoodList

Define and train separate SingleTaskGP models

In [21]:
def fit_model(model, train_x, num_epochs=500, lr=0.1):
    """Fit a GP by minimizing the negative exact marginal log likelihood with Adam.

    Args:
        model: Model to train. It must expose ``likelihood``, ``parameters()``,
            ``train()`` and ``train_targets``, and is called as ``model(train_x)``.
        train_x: Inputs passed to ``model`` on every optimization step.
        num_epochs: Number of Adam steps. Defaults to 500, the value that was
            previously hard-coded.
        lr: Adam learning rate. Defaults to 0.1, the value that was previously
            hard-coded.

    Returns:
        Tuple ``(model, mll, loss_history)`` where ``mll`` is the
        ``ExactMarginalLogLikelihood`` used for training and ``loss_history``
        is a list with one float loss value per step.
    """
    mll = ExactMarginalLogLikelihood(likelihood=model.likelihood, model=model)
    optimizer = Adam([{"params": model.parameters()}], lr=lr)

    # The model is left in training mode on return; callers must switch it to
    # eval() themselves before predicting.
    model.train()

    loss_history = []
    for _ in range(num_epochs):
        optimizer.zero_grad()
        output = model(train_x)
        # The MLL is maximized, so its negative is the loss.
        loss = -mll(output, model.train_targets)
        loss.backward()
        optimizer.step()
        loss_history.append(loss.item())
    return model, mll, loss_history

In [22]:
def _fit_objective_gp(column):
    """Build a standardized single-output GP on column `column` of `y` and train it with `fit_model`."""
    gp = SingleTaskGP(x, y[:, column].unsqueeze(-1), outcome_transform=Standardize(m=1))
    return fit_model(model=gp, train_x=x)


# Columns of `y` follow `objective_properties`; the suffixes presumably stand
# for polymer solubility (ps), gelation enthalpy (ge) and shear modulus (sm).
model_ps, mll_ps, loss_history_ps = _fit_objective_gp(0)
model_ge, mll_ge, loss_history_ge = _fit_objective_gp(1)
model_sm, mll_sm, loss_history_sm = _fit_objective_gp(2)

Generate samples 

In [23]:
import SALib as salib
from SALib.sample import saltelli
from SALib.analyze import sobol
from SALib.test_functions import Ishigami

In [25]:
# SALib problem definition: eight inputs named x1..x8, each sampled on [0, 1],
# the same range given to every input in `bounds`.
problem = {
    'num_vars': 8,
    'names': [f'x{k}' for k in range(1, 9)],
    'bounds': [[0, 1] for _ in range(8)],
}

# NOTE(review): the output captured under this cell echoes the
# `saltelli.sample` line, which looks like a library warning. Check whether
# this sampler is deprecated in the installed SALib version.
param_values = torch.tensor(saltelli.sample(problem, 1024), dtype=torch.double)

  param_values = saltelli.sample(problem, 1024)


In [8]:
# Put the ModelListGP in evaluation mode before querying its posterior.
model.eval()

# Only the posterior mean is consumed downstream, so no gradients are needed:
# disabling autograd avoids building a graph over every sample point.
with torch.no_grad():
    posterior = model.posterior(param_values)
    pred_mean = posterior.mean.detach().numpy()


In [17]:
# Sobol indices for column 0 of `pred_mean` (presumably the first entry of
# `objective_properties`, given the order in which the models were built).
# A fixed, non-zero seed makes the bootstrap confidence intervals (the *_conf
# columns) reproducible from run to run.
Si = sobol.analyze(problem, pred_mean[:, 0], seed=42)
total_Si, first_Si, second_Si = Si.to_df()
total_Si

  names = list(pd.unique(groups))


Unnamed: 0,ST,ST_conf
x1,0.074288,0.0103
x2,0.036671,0.006598
x3,0.133297,0.016547
x4,0.164148,0.023795
x5,0.053961,0.008447
x6,0.056381,0.008054
x7,0.469286,0.056985
x8,0.298067,0.03882


In [18]:
# Sobol indices for column 1 of `pred_mean` (presumably the second entry of
# `objective_properties`, given the order in which the models were built).
# A fixed, non-zero seed makes the bootstrap confidence intervals (the *_conf
# columns) reproducible from run to run.
Si = sobol.analyze(problem, pred_mean[:, 1], seed=42)
total_Si, first_Si, second_Si = Si.to_df()
total_Si

  names = list(pd.unique(groups))


Unnamed: 0,ST,ST_conf
x1,0.079919,0.014773
x2,0.139694,0.042752
x3,0.087224,0.016568
x4,0.104312,0.018602
x5,0.221498,0.038775
x6,0.25949,0.054228
x7,0.478685,0.068486
x8,0.23242,0.042334


In [19]:
# Sobol indices for column 2 of `pred_mean` (presumably the third entry of
# `objective_properties`, given the order in which the models were built).
# A fixed, non-zero seed makes the bootstrap confidence intervals (the *_conf
# columns) reproducible from run to run.
Si = sobol.analyze(problem, pred_mean[:, 2], seed=42)
total_Si, first_Si, second_Si = Si.to_df()
total_Si

  names = list(pd.unique(groups))


Unnamed: 0,ST,ST_conf
x1,0.064525,0.01938
x2,0.06158,0.014571
x3,0.045992,0.01426
x4,0.221073,0.055426
x5,0.320151,0.068245
x6,0.440472,0.112106
x7,0.241582,0.04961
x8,0.091757,0.01886


How important is surrogate model accuracy?

Observations:
- The AM:[SA+LA] ratio (hydrophobic), the [AM+IonM]:[SA+LA] ratio (hydrophilic), the solvent concentration, and the block size (hydrophilic) are important for polymer solubility.
- The AM:[SA+LA] ratio (hydrophobic), all parameters of the hydrophilic segment, and the solvent concentration are important for gelation enthalpy.
- All parameters of the hydrophilic segment are important for shear modulus.

Evaluate using separate SingleTaskGP

In [26]:
# Put each separately trained GP in evaluation mode before prediction.
model_ps.eval()
model_ge.eval()
model_sm.eval()

# Only posterior means are used for the sensitivity analysis, so run the
# predictions without autograd to avoid building a graph over every sample.
with torch.no_grad():
    posterior_ps = model_ps.posterior(param_values)
    pred_mean_ps = posterior_ps.mean.detach().numpy()

    posterior_ge = model_ge.posterior(param_values)
    pred_mean_ge = posterior_ge.mean.detach().numpy()

    posterior_sm = model_sm.posterior(param_values)
    pred_mean_sm = posterior_sm.mean.detach().numpy()

In [29]:
# Sobol indices from the separately trained `model_ge` predictions, flattened
# to the 1-D output vector that `sobol.analyze` is given.
# A fixed, non-zero seed makes the bootstrap confidence intervals (the *_conf
# columns) reproducible from run to run.
Si = sobol.analyze(problem, pred_mean_ge.flatten(), seed=42)
total_Si, first_Si, second_Si = Si.to_df()
total_Si

  names = list(pd.unique(groups))


Unnamed: 0,ST,ST_conf
x1,0.07993,0.017523
x2,0.13971,0.038288
x3,0.087232,0.013944
x4,0.104316,0.017624
x5,0.22151,0.037391
x6,0.259506,0.049676
x7,0.478671,0.066371
x8,0.232404,0.041975


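No meaningful difference between the results from the separate SingleTaskGPs and those from the ModelListGP: for gelation enthalpy, the total-order indices (ST) in the two tables agree to within about 2e-5. Only the confidence estimates (ST_conf) differ noticeably.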