In [1]:
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
import time 
import gpytorch
from gpytorch.functions import pivoted_cholesky

from gpytorch.kernels import ScaleKernel, MaternKernel, RBFKernel
from gpytorch.priors import GammaPrior
from gpytorch.likelihoods import GaussianLikelihood
import torch
import gc

gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
# Set device and global dtype
device = "cuda:0"
global_dtype = torch.float32

# Ensure reproducibility
np.random.seed(42)
torch.manual_seed(42)

# Add project source path
notebook_dir = os.getcwd()
src_path = os.path.abspath(os.path.join(notebook_dir, '../code'))
if src_path not in sys.path:
    sys.path.append(src_path)

# Import custom modules
from gps import CholeskyGaussianProcess, IterativeGaussianProcess
from util import train, eval, plot_gpr_results, fetch_uci_dataset, memory_dump
from plotting import plot_gp_simple, plot_gp_sample, plot_gp_simple_regions

# Enable autoreloading of modules
%load_ext autoreload
%autoreload 2
train_x, train_y, test_x, test_y = fetch_uci_dataset('bike',r"C:\Users\fredw\chris\Research\softki\data\uci_datasets\uci_datasets\elevators\data.csv",train_frac=1/10,val_frac=0)
train_x = train_x.to(device)
train_y = train_y.to(device)
test_x = test_x.to(device)
print(train_x.shape)

    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    
SIZE (16599, 19)
Dataset loaded
torch.Size([1659, 17])


In [2]:
base_kernel = MaternKernel(ard_num_dims=train_x.shape[-1], lengthscale_prior=GammaPrior(3.0, 6.0), nu=1.5)
kernel = ScaleKernel(base_kernel, outputscale_prior=GammaPrior(2.0, 0.15)).to(device)

cgp = CholeskyGaussianProcess(kernel=kernel, dtype=global_dtype, noise=0.4, device=device)


base_kernel = MaternKernel(ard_num_dims=train_x.shape[-1], lengthscale_prior=GammaPrior(3.0, 6.0), nu=1.5)
kernel = ScaleKernel(base_kernel, outputscale_prior=GammaPrior(2.0, 0.15)).to(device)

igp = IterativeGaussianProcess(kernel=kernel, noise=0.4, dtype=global_dtype, device=device,
                               cg_tol=1e-2, cg_max_iter=100, warm_start=False, num_probes=64,
                               precon_type="identity", trace_backend="Hutch",
                               verbose=False, track_iterations=False, 
                               pred_lanczos_rank=train_x.shape[0], compute_covariance=False)
cgp.fit(train_x,train_y)
igp.fit(train_x,train_y)

print(cgp.compute_mll(train_y))
print(igp.compute_mll(train_y))

tensor(2008.6356, device='cuda:0', grad_fn=<MulBackward0>)
tensor(2007.2786, device='cuda:0', grad_fn=<MulBackward0>)


  quadratic = y.T @ self.alpha


In [19]:
import time
import numpy as np
import torch
from tqdm import tqdm
import copy

def train_and_compare_gradients(igp, cgp, train_x, train_y, test_x, test_y, epochs=1, lr=0.01):
    # Define separate optimizers for both models
    cgp_optimizer = torch.optim.Adam([
        {'params': cgp.kernel.parameters()}, 
        {'params': [cgp.noise.u]}
    ], lr=lr)
    igp_optimizer = torch.optim.Adam([
        {'params': igp.kernel.parameters()}, 
        {'params': [igp.noise.u]}
    ], lr=lr)
    
    igp_grads = []
    cgp_grads = []
    igp_mll = []
    cgp_mll = []
    igp_rmse = []
    cgp_rmse = []
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        
        # --- CGP Update ---
        cgp_optimizer.zero_grad()
        cgp.fit(train_x, train_y)
        cgp_loss = cgp.compute_mll(train_y)
        print(cgp_loss)
        #print(cgp.kernel.outputscale)
        cgp_loss.backward()
        
        # Collect gradients for CGP kernel and noise parameters
        cgp_epoch_grads = {}
        for name, param in cgp.kernel.named_parameters():
            if param.requires_grad and param.grad is not None:
                cgp_epoch_grads[name] = param.grad.clone().cpu()
        if cgp.noise.u.grad is not None:
            cgp_epoch_grads["noise"] = cgp.noise.u.grad.clone().cpu()
        cgp_optimizer.step()
        
        # Save loss and error metrics (optional)
        cgp_mll.append(-cgp_loss.item())
        cgp_rmse.append(torch.mean(torch.abs(cgp.predict(test_x)[0].detach().cpu() - test_y)).item())
        cgp_grads.append(cgp_epoch_grads)
        
        # --- IGP Update ---
        igp.fit(train_x, train_y)
        igp_loss = igp.compute_mll(train_y)
        print(igp_loss)
        #print(igp.kernel.outputscale)

        igp_epoch_grads = igp.estimate_mll_gradient()
        
        # Convert gradients to CPU tensors or numpy arrays for consistency
        igp_epoch_grads_cpu = {}
        for key, grad in igp_epoch_grads.items():
            igp_epoch_grads_cpu[key] = grad.clone().cpu() if isinstance(grad, torch.Tensor) else torch.tensor(grad, dtype=torch.float32)
        
        igp_optimizer.zero_grad()
        for name, param in igp.kernel.named_parameters():
            if param.requires_grad and name in igp_epoch_grads:
                if param.grad is None:
                    param.grad = -igp_epoch_grads[name].to(param.device)
                else:
                    param.grad.data = -igp_epoch_grads[name].to(param.device)
        if "noise" in igp_epoch_grads:
            if igp.noise.u.grad is None:
                igp.noise.u.grad = -igp_epoch_grads["noise"].to(igp.noise.u.device)
            else:
                igp.noise.u.grad.data = -igp_epoch_grads["noise"].to(igp.noise.u.device)
        igp_optimizer.step()
        
        igp_mll.append(-igp_loss.item())
        igp_rmse.append(torch.mean(torch.abs(igp.predict(test_x)[0].detach().cpu() - test_y)).item())
        igp_grads.append(igp_epoch_grads_cpu)
        
        print(f"IGP MLL: {-igp_loss.item():.4f}, RMSE: {igp_rmse[-1]:.4f}")
        print(f"CGP MLL: {-cgp_loss.item():.4f}, RMSE: {cgp_rmse[-1]:.4f}")
    
    return {
        "igp_grads": igp_grads,
        "cgp_grads": cgp_grads,
        "igp_mll": igp_mll,
        "cgp_mll": cgp_mll,
        "igp_rmse": igp_rmse,
        "cgp_rmse": cgp_rmse,
        "igp_final": igp,
        "cgp_final": cgp
    }

def print_gradient_comparison(results, param_names=None):
    igp_grads = results["igp_grads"]
    cgp_grads = results["cgp_grads"]
    if param_names is None:
        param_names = list(igp_grads[0].keys())
    epochs = len(igp_grads)
    
    for param_name in param_names:
        print(f"\n--- Gradient Comparison for parameter: {param_name} ---")
        igp_param_values = []
        cgp_param_values = []
        
        for epoch in range(epochs):
            if param_name in igp_grads[epoch]:
                igp_grad = igp_grads[epoch][param_name]
                if isinstance(igp_grad, torch.Tensor):
                    igp_values = igp_grad.detach().flatten().numpy()
                else:
                    igp_values = np.array([igp_grad])
                igp_param_values.append(igp_values)
            else:
                igp_param_values.append(None)
            
            # For CGP, determine the matching key for the parameter
            cgp_key = None
            if param_name == "noise" and "noise" in cgp_grads[epoch]:
                cgp_key = "noise"
            else:
                cgp_key = next((k for k in cgp_grads[epoch].keys() if param_name in k), None)
            
            if cgp_key:
                cgp_grad = cgp_grads[epoch][cgp_key]
                if isinstance(cgp_grad, torch.Tensor):
                    cgp_values = cgp_grad.detach().flatten().numpy()
                else:
                    cgp_values = np.array([cgp_grad])
                cgp_param_values.append(cgp_values)
            else:
                cgp_param_values.append(None)
        
        if "lengthscale" in param_name.lower():
            print("Printing per-dimension gradients for ARD lengthscale parameter:")
            for epoch in range(epochs):
                igp_vals = igp_param_values[epoch]
                cgp_vals = cgp_param_values[epoch]
                if igp_vals is not None and cgp_vals is not None:
                    print(f"\nEpoch {epoch+1}:")
                    for idx, (i_val, c_val) in enumerate(zip(igp_vals, cgp_vals)):
                        rel_diff = np.abs(i_val - c_val) / (np.abs(c_val) + 1e-10)
                        print(f"  Dimension {idx}: IGP = {i_val:.4f}, CGP = {c_val:.4f}, Relative Difference = {rel_diff:.4f}")
                else:
                    print(f"Epoch {epoch+1}: Missing data for one of the methods.")
        else:
            is_scalar = all(val is not None and val.size == 1 for val in igp_param_values + cgp_param_values)
            if is_scalar:
                print("Epoch\tIGP Gradient\tCGP Gradient\tRelative Difference")
                for epoch in range(epochs):
                    igp_val = igp_param_values[epoch][0] if igp_param_values[epoch] is not None else None
                    cgp_val = cgp_param_values[epoch][0] if cgp_param_values[epoch] is not None else None
                    if igp_val is not None and cgp_val is not None:
                        rel_diff = np.abs(igp_val - cgp_val) / (np.abs(cgp_val) + 1e-10)
                        print(f"{epoch+1}\t{igp_val:.4f}\t\t{cgp_val:.4f}\t\t{rel_diff:.4f}")
                    else:
                        print(f"{epoch+1}\tMissing data")
            else:
                print("Non-scalar parameter gradients (displaying summary statistics per epoch):")
                for epoch in range(epochs):
                    if igp_param_values[epoch] is not None and cgp_param_values[epoch] is not None:
                        igp_mean = np.mean(igp_param_values[epoch])
                        igp_std = np.std(igp_param_values[epoch])
                        cgp_mean = np.mean(cgp_param_values[epoch])
                        cgp_std = np.std(cgp_param_values[epoch])
                        print(f"Epoch {epoch+1}:")
                        print(f"  IGP Gradient: mean = {igp_mean:.4f}, std = {igp_std:.4f}")
                        print(f"  CGP Gradient: mean = {cgp_mean:.4f}, std = {cgp_std:.4f}")
                    else:
                        print(f"Epoch {epoch+1}: Missing data for one of the methods.")
    
    print("\n=== Loss and Error Metrics ===")
    print("Epoch\tIGP MLL\t\tCGP MLL\t\tIGP RMSE\tCGP RMSE")
    for epoch in range(epochs):
        print(f"{epoch+1}\t{results['igp_mll'][epoch]:.4f}\t\t{results['cgp_mll'][epoch]:.4f}\t\t"
              f"{results['igp_rmse'][epoch]:.4f}\t\t{results['cgp_rmse'][epoch]:.4f}")

# Assuming train_x, train_y, test_x, test_y are already defined,
# and that MaternKernel, GammaPrior, ScaleKernel, CholeskyGaussianProcess,
# and IterativeGaussianProcess have been defined and imported appropriately.

base_kernel = MaternKernel(ard_num_dims=train_x.shape[-1],
                           lengthscale_prior=GammaPrior(3.0, 6.0),
                           nu=1.5)
# kernel = ScaleKernel(base_kernel).to(device) #outputscale_prior=GammaPrior(2.0, 0.15)
cgp = CholeskyGaussianProcess(kernel=base_kernel, dtype=global_dtype, noise=0.4, device=device)

base_kernel = MaternKernel(ard_num_dims=train_x.shape[-1],
                           lengthscale_prior=GammaPrior(3.0, 6.0),
                           nu=1.5)
# kernel = ScaleKernel(base_kernel).to(device) #outputscale_prior=GammaPrior(2.0, 0.15)
igp = IterativeGaussianProcess(kernel=base_kernel, noise=0.4, dtype=global_dtype, device=device,
                               cg_tol=1e-3, cg_max_iter=100, warm_start=False, num_probes=64,
                               precon_type="identity", trace_backend="Hutch",
                               verbose=False, track_iterations=False, 
                               pred_lanczos_rank=train_x.shape[0], compute_covariance=False)

results = train_and_compare_gradients(
    igp=igp, 
    cgp=cgp, 
    train_x=train_x, 
    train_y=train_y, 
    test_x=test_x, 
    test_y=test_y, 
    epochs=1,
    lr=0.01
)

print_gradient_comparison(results)


Epoch 1/1
tensor(2079.6448, device='cuda:0', grad_fn=<MulBackward0>)
tensor(2079.1553, device='cuda:0', grad_fn=<MulBackward0>)
IGP MLL: -2079.1553, RMSE: 0.4352
CGP MLL: -2079.6448, RMSE: 0.4203

--- Gradient Comparison for parameter: raw_lengthscale ---
Printing per-dimension gradients for ARD lengthscale parameter:

Epoch 1:
  Dimension 0: IGP = -39.3608, CGP = -55.4466, Relative Difference = 0.2901
  Dimension 1: IGP = -37.3631, CGP = -53.7132, Relative Difference = 0.3044
  Dimension 2: IGP = -37.5494, CGP = -52.3830, Relative Difference = 0.2832
  Dimension 3: IGP = -23.9274, CGP = -31.6835, Relative Difference = 0.2448
  Dimension 4: IGP = -33.2203, CGP = -48.0909, Relative Difference = 0.3092
  Dimension 5: IGP = -22.0128, CGP = -31.0588, Relative Difference = 0.2913
  Dimension 6: IGP = -26.7400, CGP = -37.1657, Relative Difference = 0.2805
  Dimension 7: IGP = -29.0024, CGP = -39.7394, Relative Difference = 0.2702
  Dimension 8: IGP = -20.0178, CGP = -27.5237, Relative Diffe