In [1]:
import os
import torch
import gpytorch
import pandas as pd
import numpy as np
from scipy.stats import norm



In [2]:
# torch.float64 to run in double precision, torch.float32 for single
dtype = torch.float64

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU so
# the notebook still runs end-to-end on machines without CUDA.
# Assign 'cuda' or 'cpu' explicitly here to override the automatic choice.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
import os
if not os.path.exists('gpy_data.pth'):
    raise RuntimeError("Run gpy_wenda_save_data.ipynb first!")

# map_location='cpu' makes the load independent of the device the tensors were
# saved from (a CUDA-saved file would otherwise fail on a CPU-only machine);
# the following cell moves everything to the configured device/dtype.
# NOTE: torch.load unpickles the file, so only load files you produced yourself.
train_x, train_y, test_x, test_y = torch.load('gpy_data.pth', map_location='cpu')

In [4]:
# Cast every split to the configured dtype and move it onto the configured device.
train_x, train_y, test_x, test_y = (
    split.to(device=device, dtype=dtype)
    for split in (train_x, train_y, test_x, test_y)
)

# Sanity check: show the shape of each split.
print(*(split.shape for split in (train_x, train_y, test_x, test_y)))

torch.Size([1866, 12979]) torch.Size([1866]) torch.Size([1001, 12979]) torch.Size([1001])


In [22]:
import math
# Initialize model and likelihood

class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP regression model with a constant mean and a linear kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Register the training data/likelihood and build the mean and kernel modules."""
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.LinearKernel()

    def forward(self, x):
        """Return the multivariate normal defined by the mean and kernel evaluated at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )
    
# Gaussian observation noise whose value is only constrained to be positive.
likelihood = gpytorch.likelihoods.GaussianLikelihood(
    noise_constraint=gpytorch.constraints.Positive()
)
likelihood = likelihood.to(device=device, dtype=dtype)

# Exact GP over the training split, placed on the same device/dtype as the data.
model = ExactGPModel(train_x, train_y, likelihood)
model = model.to(device=device, dtype=dtype)

In [23]:
from tqdm.notebook import tqdm
from LBFGS import FullBatchLBFGS

def train_model_bfgs(model, likelihood, x, y, learning_rate, training_iter=20):
    """Fit the GP hyperparameters with full-batch L-BFGS.

    The objective is the negative exact marginal log likelihood of ``y``
    under ``model(x)``.

    Args:
        model: GP model whose parameters are optimised.
        likelihood: Likelihood paired with ``model`` in the marginal log likelihood.
        x: Training inputs passed to ``model``.
        y: Training targets.
        learning_rate: ``lr`` handed to ``FullBatchLBFGS``.
        training_iter: Maximum number of optimiser steps.

    Returns:
        The ``(model, likelihood)`` pair that was passed in, after optimisation.
    """
    optimizer = FullBatchLBFGS(model.parameters(), lr=learning_rate)
    marginal_ll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    def closure():
        # Clear stale gradients, then re-evaluate the objective.
        model.zero_grad()
        return -marginal_ll(model(x), y)

    # The optimiser is given an already-evaluated, already-backpropagated loss
    # on every step, so compute the first one up front.
    loss = closure()
    loss.backward()

    progress = tqdm(range(training_iter))
    for _step in progress:
        # Only the new loss (first element) and the stop flag (last element)
        # of the value returned by step() are used.
        loss, _, _, _, _, _, _, fail = optimizer.step(
            {"closure": closure, "current_loss": loss, "max_ls": 10}
        )
        progress.set_postfix({'loss': loss.item(), 'fail': fail})

        if not fail:
            continue
        print('Convergence reached!')
        break

    return model, likelihood

In [24]:
# Put both modules in training mode before optimising the hyperparameters.
model.train()
likelihood.train()

# Raise gpytorch's max_cholesky_size setting to 100000 for the duration of the fit.
# NOTE(review): presumably chosen so that exact Cholesky solves are used for this
# training set size — confirm against the gpytorch settings documentation.
with gpytorch.settings.max_cholesky_size(100000):
    model, likelihood = train_model_bfgs(
        model, likelihood, train_x, train_y, learning_rate=1., training_iter=30
    )

# Switch to evaluation mode for prediction; the final expression's value is what
# the notebook displays as this cell's output.
model.eval()
likelihood.eval()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




GaussianLikelihood(
  (noise_covar): HomoskedasticNoise(
    (raw_noise_constraint): Positive()
  )
)

In [25]:
# Reference hyperparameters learned by GPy, kept here for comparison with the
# values printed below:
##   noise = 3.277223830593155e-15
##   linear variance = 4.5426621979927965e-05

print('likelihood noise', likelihood.noise)
print('linear kernel variance', model.covar_module.variance)

likelihood noise tensor([3.5572e-14], device='cuda:0', dtype=torch.float64,
       grad_fn=<SoftplusBackward>)
linear kernel variance tensor([[4.5330e-05]], device='cuda:0', dtype=torch.float64,
       grad_fn=<SoftplusBackward>)


In [26]:
# Save state dict

gene_number = 2
feature_model_format = 'model_{0:05d}'
output_dir = os.path.join("gpytorch_feature_models", feature_model_format.format(gene_number))

# torch.save does not create missing directories, so make sure the per-model
# output folder exists (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(output_dir, "state_dict.pth"))

In [27]:
# Get confidence score based on CDF of true target value on GP model
def getConfidence(model, x, y, likelihood=None):
    """Score how well the GP's predictive distribution explains the targets ``y``.

    For every sample the residual ``y - mu`` is divided by the predictive
    standard deviation, and the confidence is
    ``1 - |cdf(r) - cdf(-r)|`` under a standard normal: close to 1 when the
    target sits at the predicted mean, close to 0 when it lies far in a tail.

    Args:
        model: Trained GP model; called as ``model(x)``.
        x: Inputs to predict on.
        y: Observed targets for ``x``.
        likelihood: Likelihood applied to ``model(x)``. Defaults to
            ``model.likelihood`` so the function no longer depends on a
            notebook-global variable.

    Returns:
        Tuple ``(mu, sigma_sq, confidences)`` of NumPy arrays: predictive
        means, predictive variances and confidence scores.
    """
    if likelihood is None:
        likelihood = model.likelihood

    # Inference only: no autograd graph is needed for the predictions.
    with torch.no_grad():
        with gpytorch.settings.fast_pred_var():
            f_preds = likelihood(model(x))
        mu = f_preds.mean
        sigma_sq = f_preds.variance
        # Standardise with the standard deviation, but keep sigma_sq as the
        # variance: it was previously overwritten by its square root and
        # returned under the variance name.
        res_normed = (y - mu) / torch.sqrt(sigma_sq)

    res_normed = res_normed.cpu().detach().numpy()
    confidences = (1 - abs(norm.cdf(res_normed) - norm.cdf(-res_normed)))
    mu = mu.cpu().detach().numpy()
    sigma_sq = sigma_sq.cpu().detach().numpy()
    return mu, sigma_sq, confidences

In [28]:
# Score the test split with the trained model and print the average confidence.
# The three returned arrays are written to disk in the following cell.
mean, var, conf = getConfidence(model, test_x, test_y)
print(conf.mean())

0.5857026258402954


In [29]:
# Persist the per-sample outputs in the same folder as the saved state dict.
# Confidences are written with 10 decimal places.
conf_file = os.path.join(output_dir, "confidences.txt")
np.savetxt(conf_file, conf, fmt='%.10f')
# Predicted means are written with 5 decimal places.
mean_file = os.path.join(output_dir, "predicted_means.txt")
np.savetxt(mean_file, mean, fmt='%.5f')
# `var` is the second value returned by getConfidence, written with
# np.savetxt's default format.
# NOTE(review): confirm it holds variances (not standard deviations) before
# relying on the file name.
var_file = os.path.join(output_dir, "predicted_variances.txt")
np.savetxt(var_file, var)