In [1]:
import math
import torch
import numpy as np
import gpytorch
import pandas as pd
from matplotlib import pyplot as plt
import random

from gpytorch.models import ExactGP
from gpytorch.likelihoods import DirichletClassificationLikelihood
from gpytorch.means import ConstantMean
from gpytorch.kernels import ScaleKernel, RBFKernel

In [2]:
filename = r'../../data/olhs_run1.xlsx'
# Both sheets are read with identical layout options: rows 0 and 1 form a
# two-level column header and the first column becomes the row index.
x_pd, y_pd = (
    pd.read_excel(filename, sheet_name=sheet, header=[0, 1], index_col=[0])
    for sheet in ('Initial Design (OLHS)', 'feasibility')
)

In [3]:
# Standardise every input column: subtract the column mean and divide by the
# column standard deviation (pandas default, ddof=1).
xmeans, xstddv = x_pd.mean(axis=0), x_pd.std(axis=0)
x_pd_normal = x_pd.sub(xmeans, axis='columns').div(xstddv, axis='columns')
# NOTE: x_pd is rebound to the standardised frame; the raw values are no longer
# reachable through this name after the cell runs.
x_pd = x_pd_normal

In [4]:
# Implement K-fold cross-validation - returns list - each fold arranged as indices (train_idx, test_idx)
def kfold_split(data, num_folds):
    """Randomly partition the row indices of ``data`` into ``num_folds`` folds.

    The indices ``0 .. len(data)-1`` are shuffled with ``np.random.permutation``
    (so the result depends on NumPy's global RNG state) and cut into
    ``num_folds`` consecutive chunks. Each chunk serves once as the test set
    while the remaining indices form the training set.

    When ``len(data)`` is not divisible by ``num_folds`` the leading chunks
    receive one extra index, so every sample is held out exactly once. When it
    is divisible the chunks all have ``len(data) // num_folds`` elements.

    Parameters
    ----------
    data : sized
        Only ``len(data)`` is used.
    num_folds : int
        Number of folds; must be at least 1.

    Returns
    -------
    list of (train_indices, test_indices) tuples of integer NumPy arrays.

    Raises
    ------
    ValueError
        If ``num_folds`` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError("num_folds must be a positive integer")

    indices = np.random.permutation(len(data))
    folds = []
    start = 0
    # array_split distributes any remainder over the first chunks instead of
    # silently dropping the trailing samples.
    for test_indices in np.array_split(indices, num_folds):
        stop = start + len(test_indices)
        train_indices = np.concatenate([indices[:start], indices[stop:]])
        folds.append((train_indices, test_indices))
        start = stop
    return folds

# function to calculate class probabilities
def class_probability(X, model, likl):
    """Evaluate the model at ``X`` and estimate class probabilities by sampling.

    Parameters
    ----------
    X : tensor
        Inputs; ``X.shape[0]`` is used as the number of points.
    model : callable
        Called as ``model(X)``; the result must provide ``.sample(torch.Size)``.
    likl : callable
        Called as ``likl(model(X), noise=zeros(X.shape[0]))``.

    Returns
    -------
    (pred_dist, probs)
        ``pred_dist`` is whatever ``likl`` returned. ``probs`` is the mean over
        256 draws of ``exp(sample)`` normalised along axis -2 (presumably the
        class axis of the batched model output -- confirm against the model).
    """
    n_points = X.shape[0]
    with torch.no_grad():
        logit_dist = model(X)
        pred_dist = likl(logit_dist, noise=torch.zeros(n_points))

    # Monte-Carlo estimate: exponentiate each draw, normalise along axis -2,
    # then average over the 256 draws (leading axis).
    exp_draws = logit_dist.sample(torch.Size((256,))).exp()
    normalised = exp_draws / exp_draws.sum(dim=-2, keepdim=True)
    return pred_dist, normalised.mean(dim=0)

# function to count misclassifications
def class_count(pred, target):
    """Return the fraction of entries in ``pred`` that differ from ``target``.

    Labels are compared for inequality rather than subtracted, so the result
    is a true misclassification rate for any label encoding (multi-class
    labels, unsigned integer dtypes), not only for signed 0/1 labels.

    Parameters
    ----------
    pred, target : array-like
        Predicted and reference labels of matching shape.

    Returns
    -------
    numpy float: number of mismatching entries divided by ``len(pred)``.
    """
    pred = np.asarray(pred)
    target = np.asarray(target)
    # np.sum keeps a NumPy scalar so the division behaves like the rest of
    # NumPy arithmetic (nan with a warning for empty input).
    return np.sum(pred != target) / len(pred)

In [5]:
# We will use the simplest form of GP model, exact inference
class DirichletGPModel(ExactGP):
    """Exact GP whose mean and kernel carry a batch dimension of ``num_classes``.

    The mean is a ``ConstantMean`` and the covariance a ``ScaleKernel`` wrapping
    an ``RBFKernel``; all three are built with ``batch_shape=(num_classes,)``.
    """

    def __init__(self, train_x, train_y, likelihood, num_classes):
        super().__init__(train_x, train_y, likelihood)
        per_class = torch.Size((num_classes,))
        self.mean_module = ConstantMean(batch_shape=per_class)
        rbf = RBFKernel(batch_shape=per_class)
        self.covar_module = ScaleKernel(rbf, batch_shape=per_class)

    def forward(self, x):
        """Return the multivariate normal built from the mean and kernel at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

# function to optimize parameters of the classification GP
def train_cls_gp(model, likelihood, train_x, training_iter, lr=0.1):
    """Fit the GP hyperparameters by minimising the negative exact marginal log likelihood.

    Parameters
    ----------
    model : GP model
        Called as ``model(train_x)``; its ``parameters()`` are optimised.
    likelihood : likelihood object
        Must expose ``transformed_targets``, which are used as regression targets.
    train_x : tensor
        Training inputs passed to ``model``.
    training_iter : int
        Number of Adam steps.
    lr : float, optional
        Adam learning rate (default 0.1, the value previously hard-coded).

    Returns
    -------
    (model, likelihood)
        The same objects, updated in place. Both are left in training mode;
        callers must switch to ``eval()`` before predicting.
    """
    # Put model and likelihood into training mode
    model.train()
    likelihood.train()

    # Only model.parameters() is handed to Adam. Likelihood parameters are
    # covered only if the model registers the likelihood as a submodule
    # (presumably true for ExactGP -- confirm).
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    for _ in range(training_iter):
        # Zero gradients from previous iteration
        optimizer.zero_grad()
        # Output from model
        output = model(train_x)
        # The mll result is summed to a scalar before backprop
        loss = -mll(output, likelihood.transformed_targets).sum()
        loss.backward()
        optimizer.step()

    return model, likelihood

In [6]:
# Run configuration. Seeding makes the shuffled fold assignment
# (np.random.permutation inside kfold_split) and the Monte-Carlo sampling in
# class_probability repeatable across kernel restarts.
SEED = 0
NUM_FOLDS = 20
TRAINING_ITER = 50
np.random.seed(SEED)
torch.manual_seed(SEED)

folds = kfold_split(x_pd.values, num_folds=NUM_FOLDS)
all_x = torch.tensor(x_pd.values, dtype=torch.float)
all_y = torch.tensor(y_pd.values, dtype=torch.long)

# Per-fold accuracies, appended in fold order
train_accuracy_history = []
val_accuracy_history = []

# train_class0_prob = []
# val_class0_prob = []

for train_idx, valdn_idx in folds:
    # flatten() assumes y_pd holds a single label column -- TODO confirm
    train_x, train_y = all_x[train_idx, :], all_y[train_idx, :].flatten()
    valdn_x, valdn_y = all_x[valdn_idx, :], all_y[valdn_idx, :].flatten()

    # initialize a fresh model and likelihood for this fold
    likelihood = DirichletClassificationLikelihood(train_y, learn_additional_noise=False, alpha_epsilon=1e-4)
    model = DirichletGPModel(train_x, likelihood.transformed_targets, likelihood, num_classes=likelihood.num_classes)

    # train model
    model, likelihood = train_cls_gp(model, likelihood, train_x, training_iter=TRAINING_ITER)

    # switch to evaluation mode before predicting
    model.eval()
    likelihood.eval()
    # prediction
    train_logit_dist, training_probs = class_probability(train_x, model, likelihood)
    val_logit_dist, val_probs = class_probability(valdn_x, model, likelihood)

    # predicted label = index of the largest probability along axis 0;
    # accuracy = 1 - misclassification rate
    training_accuracy = 1 - class_count(training_probs.max(0)[1].numpy(), train_y.numpy())
    valdn_accuracy = 1 - class_count(val_probs.max(0)[1].numpy(), valdn_y.numpy())

    train_accuracy_history.append(training_accuracy)
    val_accuracy_history.append(valdn_accuracy)




In [7]:
# Display the validation accuracy recorded for each fold, in fold order.
val_accuracy_history

[0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0,
 0.0,
 1.0,
 0.0,
 1.0,
 1.0,
 1.0,
 0.0,
 1.0,
 1.0]

In [8]:
# Display the training accuracy recorded for each fold, in fold order.
train_accuracy_history

[0.7894736842105263,
 1.0,
 0.8947368421052632,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.8421052631578947,
 1.0,
 1.0,
 0.8421052631578947,
 1.0,
 0.7894736842105263,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [9]:
# Display the (train_indices, test_indices) pair produced for each fold.
folds

[(array([12, 19,  9,  0, 15,  7, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([2])),
 (array([ 2, 19,  9,  0, 15,  7, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([12])),
 (array([ 2, 12,  9,  0, 15,  7, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([19])),
 (array([ 2, 12, 19,  0, 15,  7, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([9])),
 (array([ 2, 12, 19,  9, 15,  7, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([0])),
 (array([ 2, 12, 19,  9,  0,  7, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([15])),
 (array([ 2, 12, 19,  9,  0, 15, 13, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([7])),
 (array([ 2, 12, 19,  9,  0, 15,  7, 14,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([13])),
 (array([ 2, 12, 19,  9,  0, 15,  7, 13,  4, 18,  3, 10,  1, 16, 11,  8, 17,
          6,  5]),
  array([14]

In [10]:
# Average the per-fold accuracies over all folds and report them.
mean_train_accuracy, mean_val_accuracy = (
    np.asarray(history).mean()
    for history in (train_accuracy_history, val_accuracy_history)
)
print('Avarge accuracy on training data - ', mean_train_accuracy)
print('Average accuracy on validation data - ', mean_val_accuracy)

Avarge accuracy on training data -  0.9578947368421054
Average accuracy on validation data -  0.7


**Comment on cross-validation results**

The data set is imbalanced: only 6 data points are infeasible. Should the folds be stratified by class label, so that each split preserves the feasible/infeasible ratio?