In [1]:
import sys
import os
# when python interpreter is different, add path
gems_tco_path = "/Users/joonwonlee/Documents/GEMS_TCO-1/src"
sys.path.append(gems_tco_path)
import matplotlib.pyplot as plt

# Data manipulation and analysis
import pandas as pd
import numpy as np
import pickle 
from collections import defaultdict

from pathlib import Path
import time
import json
from json import JSONEncoder

# Special functions and optimizations
from typing import Callable, Union, Tuple
from scipy.spatial.distance import cdist  # For space and time distance
from scipy.special import gamma, kv  # Bessel function and gamma function
from scipy.interpolate import splrep, splev

import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from torchcubicspline import natural_cubic_spline_coeffs, NaturalCubicSpline

import GEMS_TCO
from GEMS_TCO import kernels 
from GEMS_TCO import orderings as _orderings
from GEMS_TCO import load_data

from GEMS_TCO import configuration as config


In [None]:
# conda activate faiss_env
# Runs mymac_config.py with the faiss_env interpreter, passing --space "20,20" and --days "0,31".
# NOTE(review): both paths below are absolute and machine-specific; adjust them before running elsewhere.

!/opt/anaconda3/envs/faiss_env/bin/python /Users/joonwonlee/Documents/GEMS_TCO-1/src/GEMS_TCO/mymac_config.py --space "20,20" --days "0,31"


In [60]:
# --- Experiment configuration -------------------------------------------------
lat_lon_resolution = [8, 8]
years = ['2024']
month_range = [7, 8]
nheads = 10
mm_cond_number = 10
v = 0.5

# --- Load the saved daily estimates and the ordered data ----------------------
data_load_instance = load_data(config.mac_data_load_path)
df = data_load_instance.read_pickle(config.mac_estimates_day_path, config.mac_full_day_v05_pickle)
# NOTE(review): `map` shadows the Python builtin of the same name for the rest of
# the notebook; the name is kept because it is a notebook-level variable.
map, ord_mm, nns_map = data_load_instance.load_mm20k_data_bymonthyear(
    lat_lon_resolution=lat_lon_resolution,
    mm_cond_number=mm_cond_number,
    years_=years,
    months_=month_range,
)

for day in range(1, 2):
    # Initial parameters: every column except the last one of row `day - 1`.
    params = list(df.iloc[day - 1][:-1])
    params = torch.tensor(params, dtype=torch.float64, requires_grad=True)

    # Each axis is divided by its own resolution (the second factor used to reuse
    # lat_lon_resolution[0], which is only right when both entries are equal).
    points_per_hour = int(158.7 / lat_lon_resolution[0] * (113.63 / lat_lon_resolution[1]))
    print(f'2024-07-{day+1}, data size per hour: { points_per_hour }, smooth: {v}')
    print(f'mm_cond_number: {mm_cond_number},\ninitial parameters: \n {params}')

    # Slice bounds 8*day .. 8*(day+1) handed to the loader for this day.
    idx_for_datamap = [8 * (day), 8 * (day + 1)]
    analysis_data_map, aggregated_data = data_load_instance.load_working_data_byday(
        map, ord_mm, nns_map, idx_for_datamap=idx_for_datamap
    )

2024-07-2, data size per hour: 281, smooth: 0.5
mm_cond_number: 10,
initial parameters: 
 tensor([ 2.4793e+01,  1.5845e+00,  1.7182e+00,  9.0885e-03, -1.0730e-01,
         1.3104e-01,  2.7172e+00], dtype=torch.float64, requires_grad=True)


In [55]:
# Full likelihood versus the Vecchia approximation (vecchia_may9), both evaluated
# with the matern_cov_anisotropy_v05 covariance at the initial parameters.
instance = kernels.vecchia_experiment(
    0.5,
    analysis_data_map,
    aggregated_data,
    nns_map,
    mm_cond_number,
    nheads,
)
excat_ll = instance.full_likelihood(
    params,
    aggregated_data[:, :4],
    aggregated_data[:, 2],
    instance.matern_cov_anisotropy_v05,
)

cov_map = instance.cov_structure_saver(
    params,
    instance.matern_cov_anisotropy_v05,
)
vecc_ll = instance.vecchia_may9(
    params,
    instance.matern_cov_anisotropy_v05,
    cov_map,
)
print(f'full: {excat_ll.item()}, vecc: {vecc_ll.item()}')   # (14435.97)

full: 4180.555600142574, vecc: 4252.477514877538


In [56]:
coarse_factor_head = 4 # 16:2 8:4  4:16, i expect 2:64, 1:128
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=v,
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=mm_cond_number,  # same value that was used to load nns_map
)
spline_instance.nheads = 500

# The distance matrix is computed once; an identical earlier call whose result
# was overwritten before use has been dropped.
distances, non_zero_indices = spline_instance.precompute_coords_anisotropy(params, aggregated_data, aggregated_data)
spline_object_head = spline_instance.fit_cubic_spline(distances, spline_instance.coarse_factor_head)

spline_full = spline_instance.full_likelihood_using_spline(
    params, aggregated_data[:, :4], aggregated_data[:, 2], distances, spline_object_head
)

# 12663.4804
print( f'exact full: {excat_ll.item()}, spline_full: {spline_full.item()}' )


exact full: 4180.555600142574, spline_full: 4180.186564100173


In [None]:
coarse_factor_head = 4
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=v,                       # same smoothness as the spline_full it is compared with
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=mm_cond_number,  # same value that was used to load nns_map
)
spline_instance.nheads = 50

# No distance matrix is needed in this cell: the two calls below only take `params`
# (and the cached covariance map), so the unused precompute call was removed.
cov_map = spline_instance.cov_structure_saver_using_spline(params)
vecc = spline_instance.vecchia_nll_using_spline(params, cov_map)
# `spline_full` comes from the spline full-likelihood cell, which must be run first.
print(f' spline full likelihood: {spline_full.item()}, spline vecchia: {vecc.item()}' )

 spline full likelihood: 4180.186564100173, spline vecchia: 4235.268833591424


In [31]:
# Spline Vecchia likelihood with coarse_factor_head = 8 and 10 head points.
coarse_factor_head = 8 # 4:16 8:4
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=0.5,
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=10,
)
spline_instance.nheads = 10

cov_map = spline_instance.cov_structure_saver_using_spline(params)
vecc = spline_instance.vecchia_nll_using_spline(params, cov_map)
vecc

tensor(14636.4830, dtype=torch.float64, grad_fn=<AddBackward0>)

### Note that coarse_factor_head in vecchia model relies on nheads

In [57]:
# --- Spline-based Vecchia likelihood (70 head points) ---
coarse_factor_head = 1 # 4:16 8(284):4, 16(70):1
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=0.5,
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=10,
)
spline_instance.nheads = 70

cov_map_spline = spline_instance.cov_structure_saver_using_spline(params)
vecc = spline_instance.vecchia_nll_using_spline(params, cov_map_spline)

# --- Reference Vecchia likelihood with the matern_cov_anisotropy_v05 covariance ---
instance = kernels.vecchia_experiment(
    0.5,
    analysis_data_map,
    aggregated_data,
    nns_map,
    mm_cond_number,
    nheads,
)
cov_map = instance.cov_structure_saver(
    params,
    instance.matern_cov_anisotropy_v05,
)
vecc_ll = instance.vecchia_may9(
    params,
    instance.matern_cov_anisotropy_v05,
    cov_map,
)
print(f'vecc: {vecc_ll.item()}, spline vecc: {vecc.item()}')   # (14435.97)

vecc: 4252.477514877538, spline vecc: 4233.940743985809


likelihood 

# Debugging note: at high resolution, cov_1d returns NaNs.
# Summary: the element-wise difference has to be smaller than 2.53e-7 for the
# likelihood difference to stay below 0.15.

## I suggest 1000 for resolution 1250(4,4) and 5000 for (2,2) and 50,000 for (1,1)


resolution 3,3
10,000:   total diff   1.66       5.01e-9
100,000                868        2.6e-6

resolution 4,4  (160000**2/(10000**2)  1/256 from original)
#coarse factor 5 error coarse factor 10 okay
coarse_factor 100 took 18 sec       sum diff 0.167   1.67e-9
coarse_factor 1000 okay difference elementwise ( sum diff 0.2831, 2.83e-9 )
coarse_factor 10,000        sum difference 1.45 (   1.45/10000**2= 1.5e-8  )

resolution 6,6
100:     sum:0.028  1.315e-9
1000:   sum: 0.0314   1.47e-9
10000:  sum: -124    5.82e-6

resolution 10,10

coarse_factor 100     sum diff 0.02     8.5e-9
coarse_factor 1000    sum diff  -13.8154   -5.39 e-6

coarse_factor 10,000  sum diff 3793
coarse_factor 100,000 began to show difference at 10-4

resolution 20,20
coarse_factor 100     sum diff 5.729   3.57e-5
coarse_factor 1000    sum diff  200

In [6]:
# Build the spline kernel and the parameter-dependent distance matrix that the
# covariance comparison in the next cell uses.
coarse_factor_head = 4 # 16:2 8:4  4:16, i expect 2:64, 1:128
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=v,
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=10,
)
spline_instance.nheads = 500

distances, non_zero_indices = spline_instance.precompute_coords_anisotropy(
    params,
    aggregated_data,
    aggregated_data,
)


In [15]:
# --- Covariance matrix rebuilt from the spline ---
spline_object_head = spline_instance.fit_cubic_spline(
    distances,
    spline_instance.coarse_factor_head,
)  # change here
cov_1d = spline_object_head.evaluate(distances)
sigmasq, _, _, _, _, _, nugget = params
# Reshape to the distance-matrix shape, scale by sigmasq, add the nugget on the diagonal.
cov_matrix = cov_1d.reshape(distances.shape)
cov_matrix = cov_matrix * sigmasq
cov_matrix = cov_matrix + torch.eye(cov_matrix.shape[0], dtype=torch.float64) * nugget
print(f'Covarinace matrix using spline:\n\n {cov_matrix}\n')

# --- Covariance matrix from matern_cov_anisotropy_kv, used as the reference ---
instance_2 = kernels.vecchia_experiment(
    v,
    analysis_data_map,
    aggregated_data,
    nns_map,
    mm_cond_number,
    nheads,
)
out = instance_2.matern_cov_anisotropy_kv(
    params,
    instance_2.aggregated_data,
    instance_2.aggregated_data,
)
print(f'Original covarinace matrix : \n\n {out}')

print(f'shape of the matrices: {cov_matrix.shape}')

# Both summaries are signed sums, so positive and negative errors can cancel.
print(f'sum of differences: {torch.sum ( cov_matrix-out )}')
print(f'element-wise difference on average: {torch.sum(cov_matrix-out)/ cov_matrix.shape[0]**2}')

Covarinace matrix using spline:

 tensor([[27.5107,  0.8990,  1.1548,  ...,  3.6048,  1.9665,  5.8334],
        [ 0.8990, 27.5107,  0.0942,  ...,  3.6373,  0.1284,  2.0618],
        [ 1.1548,  0.0942, 27.5107,  ...,  0.4144,  7.1918,  0.7017],
        ...,
        [ 3.6048,  3.6373,  0.4144,  ..., 27.5107,  0.7292, 13.2462],
        [ 1.9665,  0.1284,  7.1918,  ...,  0.7292, 27.5107,  1.3196],
        [ 5.8334,  2.0618,  0.7017,  ..., 13.2462,  1.3196, 27.5107]],
       dtype=torch.float64, grad_fn=<AddBackward0>)

Original covarinace matrix : 

 tensor([[27.5107,  0.8990,  1.1548,  ...,  3.6048,  1.9665,  5.8334],
        [ 0.8990, 27.5107,  0.0942,  ...,  3.6373,  0.1284,  2.0618],
        [ 1.1548,  0.0942, 27.5107,  ...,  0.4144,  7.1919,  0.7017],
        ...,
        [ 3.6048,  3.6373,  0.4144,  ..., 27.5107,  0.7292, 13.2462],
        [ 1.9665,  0.1284,  7.1919,  ...,  0.7292, 27.5107,  1.3196],
        [ 5.8334,  2.0618,  0.7017,  ..., 13.2462,  1.3196, 27.5107]],
       dtype=

optimization for full likelihood

In [None]:
coarse_factor_head = 4 # 16:2 8:4  4:16, i expect 2:64, 1:128
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=v,
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=mm_cond_number,  # same value that was used to load nns_map
)
spline_instance.nheads = 50
print(params)

# Adam updates the tensor it is given in place. Optimising a fresh leaf copy keeps
# the shared `params` at its initial values, so this cell can be re-run and the
# other cells still see the same starting point.
params_full_fit = params.detach().clone().requires_grad_(True)
optimizer, scheduler = spline_instance.optimizer_fun(
    params_full_fit, lr=0.02, betas=(0.9, 0.99), eps=1e-8, step_size=100, gamma=0.2
)
out, epoch = spline_instance.run_full(params_full_fit, aggregated_data, optimizer, scheduler, epochs=500)


tensor([ 2.4793e+01,  1.5845e+00,  1.7182e+00,  9.0885e-03, -1.0730e-01,
         1.3104e-01,  2.7172e+00], dtype=torch.float64, requires_grad=True)
Epoch 1, Gradients: [  -15.20851441   120.5063186      2.6793614    112.21640154
   415.09729463 -1298.56608861  -131.57867753]
 Loss: 4180.186564100173, Parameters: [ 2.47934437e+01  1.58452892e+00  1.71824777e+00  9.08850413e-03
 -1.07299447e-01  1.31037638e-01  2.71723866e+00]
Epoch 101, Gradients: [ -0.56453347  -1.35252707  -0.35120962  -0.17410995  -1.05352026
   1.46938781 -11.13395343]
 Loss: 3995.81722199123, Parameters: [25.65032925  0.81789756  1.33432273 -0.03404876 -0.20534873  0.23931403
  3.76331965]
Epoch 201, Gradients: [-4.73988425e-01 -3.07447383e+00 -2.67035741e-01  6.36760257e-03
 -8.58732676e-02  2.16389788e+00 -1.06408411e+01]
 Loss: 3993.538017034656, Parameters: [25.72579407  0.84119467  1.38558457 -0.03311623 -0.20346034  0.2292493
  3.96119062]
Epoch 301, Gradients: [ -0.47940358  -2.91583638  -0.25999449  -0.055

optimization for vecchia approximation

In [58]:
coarse_factor_head = 1 # 16:2 8:4  4:16, i expect 2:64, 1:128
coarse_factor_cond = 1
spline_instance = kernels.spline(
    epsilon=0,
    coarse_factor_head=coarse_factor_head,
    coarse_factor_cond=coarse_factor_cond,
    smooth=v,
    input_map=analysis_data_map,
    aggregated_data=aggregated_data,
    nns_map=nns_map,
    mm_cond_number=mm_cond_number,  # same value that was used to load nns_map
)
spline_instance.nheads = 70

# Adam updates the tensor it is given in place. Optimising a fresh leaf copy keeps
# the shared `params` at its initial values, so this cell can be re-run and the
# other cells still see the same starting point.
params_vecchia_fit = params.detach().clone().requires_grad_(True)
optimizer, scheduler = spline_instance.optimizer_fun(
    params_vecchia_fit, lr=0.02, betas=(0.9, 0.99), eps=1e-8, step_size=100, gamma=0.2
)
out, epoch = spline_instance.fit_vecchia(params_vecchia_fit, optimizer, scheduler, epochs=500)

Epoch 1, Gradients: [   -7.67722426     2.83025242    16.85498825     2.64241634
   238.00750953 -1084.52291744   -61.3463364 ]
 Loss: 4233.940743985809, Parameters: [ 2.47934437e+01  1.58452892e+00  1.71824777e+00  9.08850413e-03
 -1.07299447e-01  1.31037638e-01  2.71723866e+00]
Epoch 101, Gradients: [-1.12733587 -0.03197711 -0.15042502  0.13406993  0.02084264  2.49632787
 -3.03525171]
 Loss: 4084.463421299732, Parameters: [26.04144235  0.82541716  0.93562984 -0.10381089 -0.27430643  0.43281586
  3.56584923]
Epoch 201, Gradients: [-1.02647531e+00 -2.26693468e-01 -2.97412747e-01  7.69974465e-04
  9.68885649e-03  2.45994888e+00 -2.89671405e+00]
 Loss: 4083.364952753561, Parameters: [26.28595707  0.83535842  0.95487828 -0.10441174 -0.27445142  0.42099665
  3.72530959]
Epoch 301, Gradients: [-1.00559833 -0.20466934 -0.26562644  0.01845811  0.03619645  2.29507976
 -2.88125286]
 Loss: 4081.932979925134, Parameters: [26.34813217  0.83956224  0.9621159  -0.10445452 -0.27459755  0.41696864
  3

# Saved spline class (May 26, 2025)

In [None]:
class spline(spatio_temporal_kernels):
    '''
    fit_cubic_spline(): all data sets share the same locations. Even though the
    'distances' matrix is a function of the parameters, we can build a common upper
    bound by setting the range parameters to 0.5, the advections to 0 and beta to 2,
    and we fit the cubic spline to a Matern model with fixed smoothness, range=1 and sigmasq=1.
    Essentially, we are approximating simple Matern model for v=1.
    
    Any change in parameters will be reflected through "distances" matrix. So,
    we define "distances" matrix for each epoch.
    
    '''
    def __init__(self, epsilon:float, coarse_factor_head:int, coarse_factor_cond:int, smooth:float, input_map: Dict[str, Any], aggregated_data:torch.Tensor, nns_map: np.ndarray, mm_cond_number:int):
        """
        Initialize the spline kernel on top of ``spatio_temporal_kernels``.

        Args:
            epsilon (float): Starting point for the spline fitting; stored as
                ``self.epsilon``.
            coarse_factor_head (int): Coarsening factor stored as
                ``self.coarse_factor_head``; passed to ``fit_cubic_spline`` for
                the head block and the full likelihood.
            coarse_factor_cond (int): Coarsening factor stored as
                ``self.coarse_factor_cond``; passed to ``fit_cubic_spline`` for
                each conditioning set.
            smooth (float): Smooth parameter in Matern model.
            input_map (Dict[str, Any]): Dictionary containing input mappings.
            aggregated_data (torch.Tensor): Tensor containing aggregated data.
            nns_map (np.ndarray): 2-d array containing nearest neighbors mappings.
            mm_cond_number (int): Condition number for Vecchia approximation.
        """
        super().__init__(smooth, input_map, aggregated_data, nns_map, mm_cond_number)

        # Kept as a float64 tensor; fit_cubic_spline uses it as the Bessel order
        # and as an exponent.
        self.smooth = torch.tensor(smooth, dtype=torch.float64)
        self.epsilon = epsilon
        self.coarse_factor_head = coarse_factor_head
        self.coarse_factor_cond = coarse_factor_cond
        # NOTE(review): self.nheads is read by the Vecchia methods but is not assigned
        # here; the notebook sets it on the instance after construction. Confirm
        # whether the base class provides a default.

    def fit_cubic_spline(self, target_distances, coarse_factor:int=4):

        """
        Fit a natural cubic spline to the unit Matern correlation on a uniform grid.

        The grid runs from 0 to slightly past the largest entry of
        ``target_distances``. Its size is the number of distinct strictly-upper-
        triangle distances (plus one for zero) divided by ``coarse_factor``, with
        a floor of two knots.

        Args:
            target_distances (torch.Tensor): Square matrix of pairwise distances.
                The knots are square-rooted before the Bessel evaluation, so this
                presumably holds squared distances; confirm against
                ``precompute_coords_anisotropy``.
            coarse_factor (int): Grid coarsening factor; larger values give fewer
                knots. Must be at least 1.

        Returns:
            NaturalCubicSpline: The fitted spline object with coefficients.

        Raises:
            ValueError: If ``coarse_factor`` is smaller than 1.
        """
        if coarse_factor < 1:
            raise ValueError(f"coarse_factor must be at least 1, got {coarse_factor}")

        def flat_distance_matrix(distances: torch.Tensor) -> torch.Tensor:
            # Largest distance and number of distinct distances (zero included).
            n = distances.size(0)
            indices = torch.triu_indices(n, n, offset=1)
            upper_tri = distances[indices[0], indices[1]]
            unique_sorted = torch.unique(upper_tri, sorted=True)
            zero = torch.zeros(1, dtype=unique_sorted.dtype, device=unique_sorted.device)
            flat_distances = torch.cat([zero, unique_sorted])
            max_distance = torch.max(flat_distances).clone().detach()
            len_distance_arr = len(flat_distances)

            return max_distance, len_distance_arr

        max_distance, len_distance_arr = flat_distance_matrix(target_distances)

        # A spline needs at least two knots, even for tiny inputs or large factors.
        n_knots = max(2, len_distance_arr // coarse_factor)

        # fit_distances should be 1 d array to be used in natural_cubic_spline_coeffs.
        # The knots are built in float64 like the rest of the computation; the
        # default float32 grid loses resolution when many knots are packed closely.
        fit_distances = torch.linspace(0, float(max_distance) + 1e-6, n_knots, dtype=torch.float64)
        non_zero_indices = fit_distances != 0
        out = torch.zeros_like(fit_distances, dtype= torch.float64)

        if torch.any(non_zero_indices):
            tmp = kv(self.smooth, torch.sqrt(fit_distances[non_zero_indices])).double().clone()
            out[non_zero_indices] = (1 * (2**(1-self.smooth)) / gamma(self.smooth) *
                                    (torch.sqrt(fit_distances[non_zero_indices]) ) ** self.smooth *
                                    tmp)
        # The correlation at zero distance is 1 (the Bessel term is undefined there).
        out[~non_zero_indices] = 1

        # Compute spline coefficients. If input is tensor, so is output.
        # natural_cubic_spline_coeffs(t, x): t is a 1-d array (n,) and x is (n, channels),
        # where channels is the number of features; out.unsqueeze(1) gives (n, 1).
        coeffs = natural_cubic_spline_coeffs(fit_distances, out.unsqueeze(1))
        # Create spline object
        spline = NaturalCubicSpline(coeffs)
        return spline


    def interpolate_cubic_spline(self, params:torch.Tensor, target_distances:torch.Tensor, spline_object) -> torch.Tensor:

        """
        Build a covariance matrix by evaluating a fitted spline at pairwise distances.

        The spline is evaluated on the upper triangle (diagonal included) only;
        the result is mirrored, multiplied by ``sigmasq`` and the nugget is added
        on the diagonal.

        Args:
            params (torch.Tensor): Seven parameters; the first is used as
                ``sigmasq`` and the last as ``nugget``.
            target_distances (torch.Tensor): Square matrix of distances.
            spline_object (NaturalCubicSpline): The fitted spline object.

        Returns:
            torch.Tensor: Covariance matrix with the shape of ``target_distances``.
        """

        sigmasq, _, _, _, _, _, nugget = params
        size = target_distances.size(0)
        rows, cols = torch.triu_indices(size, size, offset=0)  # offset=0 keeps the diagonal

        # evaluate() returns an [N, 1] tensor, flattened before it is scattered.
        upper_values = spline_object.evaluate(target_distances[rows, cols])

        upper = torch.zeros_like(target_distances)
        upper[rows, cols] = upper_values.view(-1)

        # Mirror into the lower triangle; the diagonal would otherwise count twice.
        symmetric = upper + upper.T - torch.diag(torch.diag(upper))

        scaled = symmetric * sigmasq
        identity = torch.eye(size, dtype=torch.float64, device=scaled.device)
        return scaled + identity * nugget


    def full_likelihood_using_spline(self, params:torch.Tensor, input_data: torch.Tensor, y: torch.Tensor, target_distances:torch.Tensor, spline_object):
        """
        Full negative log-likelihood (up to an additive constant) with the spline covariance.

        Columns 0-1 of ``input_data`` form the regression design matrix and
        column 2 is the response. The mean coefficients are estimated by
        generalised least squares before the quadratic form is evaluated.

        Args:
            params (torch.Tensor): Model parameters passed to ``interpolate_cubic_spline``.
            input_data (torch.Tensor): Data matrix; columns 0-2 are read.
            y (torch.Tensor): Not used; the response is taken from ``input_data[:, 2]``.
            target_distances (torch.Tensor): Pairwise distance matrix of the rows.
            spline_object: Fitted spline from ``fit_cubic_spline``.

        Returns:
            torch.Tensor: ``0.5 * (log|Sigma| + r' Sigma^{-1} r)``.
        """
        sigma_mat = self.interpolate_cubic_spline(params, target_distances, spline_object)

        # The sign is ignored, so a non-positive-definite matrix is not detected here.
        _, log_det = torch.slogdet(sigma_mat)

        design = input_data[:, :2]
        observed = input_data[:, 2]

        # GLS estimate: beta = (X' S^-1 X)^-1 X' S^-1 y
        xt_sinv_x = design.T @ torch.linalg.solve(sigma_mat, design)
        xt_sinv_y = design.T @ torch.linalg.solve(sigma_mat, observed)
        beta = torch.linalg.solve(xt_sinv_x, xt_sinv_y)

        residual = observed - design @ beta
        quad_form = residual @ torch.linalg.solve(sigma_mat, residual)
        return 0.5 * (log_det + quad_form)

    def cov_structure_saver_using_spline(self, params: torch.Tensor) -> Dict[Any, Dict[str, torch.Tensor]]:
        """
        Pre-compute the conditional-covariance quantities for the Vecchia terms.

        For each of the first (at most three) time steps and each non-head point,
        the conditioning set is the point's neighbours at the same time plus the
        neighbours and the point itself at up to two earlier time steps. The
        spline covariance of that set is built and the derived quantities are cached.

        Args:
            params (torch.Tensor): Model parameters.

        Returns:
            Mapping from ``(time_idx, index)`` to a dict with keys ``tmp1``,
            ``cov_xx_inv``, ``cov_matrix``, ``cov_ygivenx``, ``cond_mean_tmp``,
            ``log_det`` and ``locs``.

        NOTE(review): every cached tensor is cloned and detached, so values read
        from this cache carry no gradient with respect to ``params``. Confirm this
        is intended when the cache feeds an optimisation loop.
        """
        cov_map = defaultdict(lambda: defaultdict(dict))
        cut_line= self.nheads
        key_list = list(self.input_map.keys())

        # Three lag patterns exist (0, 1 and 2 earlier time steps); never index
        # past the time steps that are actually available.
        n_lag_patterns = min(3, len(key_list))

        for time_idx in range(n_lag_patterns):
            current_array = self.input_map[key_list[time_idx]]

            # Use below when working on local computer to avoid singular matrix
            for index in range(cut_line, self.size_per_hour):
                current_row = current_array[index].reshape(1, -1)
                past = list(self.nns_map[index])
                data_list = []

                if past:
                    data_list.append(current_array[past])

                if time_idx > 0:
                    one_hour_lag = self.input_map[key_list[time_idx - 1]]
                    data_list.append(one_hour_lag[past + [index], :])

                if time_idx > 1:
                    two_hour_lag = self.input_map[key_list[time_idx - 2]]
                    data_list.append(two_hour_lag[past + [index], :])

                conditioning_data = torch.vstack(data_list) if data_list else torch.empty((0, current_row.shape[1]), dtype=torch.float64)
                # Row 0 is the target point; the remaining rows are its conditioning set.
                aggregated_arr = torch.vstack((current_row, conditioning_data))
                locs = aggregated_arr[:, :2]

                target_distances_for_cond, _ = self.precompute_coords_anisotropy(params, aggregated_arr, aggregated_arr)

                cond_spline_object = self.fit_cubic_spline(target_distances_for_cond, self.coarse_factor_cond)
                cov_matrix = self.interpolate_cubic_spline(params, target_distances_for_cond, cond_spline_object)

                tmp1 = torch.matmul(locs.T, torch.linalg.solve(cov_matrix, locs))

                # Mean and variance of y|x
                sigma = cov_matrix[0, 0]
                cov_yx = cov_matrix[0, 1:]
                cov_xx = cov_matrix[1:, 1:]
                cov_xx_inv = torch.linalg.inv(cov_xx)
                cov_ygivenx = sigma - torch.matmul(cov_yx, torch.matmul(cov_xx_inv, cov_yx))
                cond_mean_tmp = torch.matmul(cov_yx, cov_xx_inv)
                # Log of the conditional variance. The slogdet of the whole matrix
                # was computed and immediately overwritten before, so it is dropped.
                log_det = torch.log(cov_ygivenx)

                cov_map[(time_idx,index)] = {
                    'tmp1': tmp1.clone().detach(),
                    'cov_xx_inv': cov_xx_inv.clone().detach(),
                    'cov_matrix': cov_matrix.clone().detach(),
                    'cov_ygivenx': cov_ygivenx.clone().detach(),
                    'cond_mean_tmp': cond_mean_tmp.clone().detach(),
                    'log_det': log_det.clone().detach(),
                    'locs': locs.clone().detach()
                }
        return cov_map

    def vecchia_nll_using_spline(self, params: torch.Tensor, cov_map:Dict[str,Any]) -> torch.Tensor:
        """
        Vecchia negative log-likelihood using the spline covariance.

        The first ``self.nheads`` rows of every time step are stacked and scored
        with the full likelihood. Every remaining point contributes a univariate
        conditional term built from the cached quantities in ``cov_map``.

        Args:
            params (torch.Tensor): Model parameters.
            cov_map: Cache produced by ``cov_structure_saver_using_spline``, keyed
                by ``(time_idx, index)``. Time steps from the third onward reuse
                the entries stored under ``time_idx == 2``.

        Returns:
            torch.Tensor: Head-block likelihood plus the sum of conditional terms.
        """
        cut_line= self.nheads
        key_list = list(self.input_map.keys())
        neg_log_lik = 0.0

        # Head block: the first `cut_line` rows of each time step, stacked in time order.
        heads = torch.cat([self.input_map[key][:cut_line, :] for key in key_list], dim=0)

        distances_heads, _ = self.precompute_coords_anisotropy(params, heads, heads)
        spline_object_head = self.fit_cubic_spline(distances_heads, self.coarse_factor_head)

        neg_log_lik += self.full_likelihood_using_spline(params, heads[:,:4], heads[:,2], distances_heads, spline_object_head)

        for time_idx in range(0, len(self.input_map)):
            current_np = self.input_map[key_list[time_idx]]
            # Only lag patterns 0, 1 and 2 are cached; later time steps reuse pattern 2.
            cache_time_idx = min(time_idx, 2)

            for index in range(cut_line, self.size_per_hour):
                current_row = current_np[index].reshape(1, -1)
                current_y = current_row[0, 2]

                past = list(self.nns_map[index])
                data_list = []

                if past:
                    data_list.append(current_np[past])

                cached = cov_map[(cache_time_idx, index)]
                cov_matrix = cached['cov_matrix']
                tmp1 = cached['tmp1']
                cov_ygivenx = cached['cov_ygivenx']
                cond_mean_tmp = cached['cond_mean_tmp']
                log_det = cached['log_det']
                locs = cached['locs']

                # The stacking order must match the one used when the cache was built.
                if time_idx >= 1:
                    one_hour_lag = self.input_map[key_list[time_idx - 1]]
                    data_list.append(one_hour_lag[past + [index], :])

                if time_idx > 1:
                    two_hour_lag = self.input_map[key_list[time_idx - 2]]
                    data_list.append(two_hour_lag[past + [index], :])

                if data_list:
                    conditioning_data = torch.vstack(data_list)
                else:
                    conditioning_data = torch.empty((0, current_row.shape[1]), dtype=torch.float64)

                aggregated_arr = torch.vstack((current_row, conditioning_data))
                aggregated_y = aggregated_arr[:, 2]

                # Local GLS mean over the target point and its conditioning set.
                tmp2 = torch.matmul(locs.T, torch.linalg.solve(cov_matrix, aggregated_y))
                beta = torch.linalg.solve(tmp1, tmp2)
                mu = torch.matmul(locs, beta)
                mu_current = mu[0]
                mu_neighbors = mu[1:]

                # Mean and variance of y|x
                cond_mean = mu_current + torch.matmul(cond_mean_tmp, (aggregated_y[1:] - mu_neighbors))
                alpha = current_y - cond_mean
                quad_form = alpha**2 * (1 / cov_ygivenx)
                neg_log_lik += 0.5 * (log_det + quad_form)
        return neg_log_lik

    def compute_full_nll(self, params:torch.Tensor, aggregated_data, distances:torch.Tensor, spline_object): 
        """Return the full spline likelihood, using columns 0-3 as input and column 2 as response."""
        input_columns = aggregated_data[:, :4]
        response = aggregated_data[:, 2]
        return self.full_likelihood_using_spline(params, input_columns, response, distances, spline_object)

    def compute_vecchia_nll(self, params:torch.Tensor): 
        """Rebuild the covariance cache for ``params`` and return the spline Vecchia likelihood."""
        return self.vecchia_nll_using_spline(
            params,
            self.cov_structure_saver_using_spline(params),
        )

    def optimizer_fun(self, params:torch.Tensor, lr:float =0.01, betas: tuple=(0.9, 0.8), eps:float=1e-8, step_size:int=40, gamma:float=0.5):
        """
        Create an Adam optimizer over ``params`` together with a StepLR scheduler.

        Returns:
            tuple: ``(optimizer, scheduler)``.
        """
        adam = torch.optim.Adam([params], lr=lr, betas=betas, eps=eps)
        # StepLR multiplies the learning rate by `gamma` every `step_size` scheduler steps.
        step_scheduler = StepLR(adam, step_size=step_size, gamma=gamma)
        return adam, step_scheduler

    def run_full(self, params:torch.Tensor, aggregated_data, optimizer:torch.optim.Optimizer, scheduler:torch.optim.lr_scheduler, epochs:int=10 ):

        """
        Run the training loop for the full likelihood model.

        The distance matrix and the spline are rebuilt every epoch because the
        distances depend on ``params``. The loop stops early once the loss
        changes by less than 1e-3 between consecutive epochs.

        Args:
            params (torch.Tensor): Model parameters; updated in place by the optimizer.
            aggregated_data (torch.Tensor): Data matrix; columns 0-3 are used.
            optimizer (torch.optim.Optimizer): Optimizer for updating parameters.
            scheduler (torch.optim.lr_scheduler): Learning rate scheduler.
            epochs (int): Maximum number of epochs to train; must be at least 1.

        Returns:
            list: Final parameters rounded to 3 decimals, followed by the rounded loss.
            int: Zero-based index of the last epoch that was executed.

        Raises:
            ValueError: If ``epochs`` is smaller than 1.
        """
        # With zero iterations `loss` and `epoch` would be unbound further down.
        if epochs < 1:
            raise ValueError(f"epochs must be at least 1, got {epochs}")

        prev_loss= float('inf')
        # 1e-3: Faster convergence, slightly lower accuracy than 1e-4
        tol = 1e-3  # Convergence tolerance
        for epoch in range(epochs):
            optimizer.zero_grad()  # Zero the gradients
            distances, _ = self.precompute_coords_anisotropy(params, aggregated_data[:,:4], aggregated_data[:,:4])
            spline_object = self.fit_cubic_spline(distances, self.coarse_factor_head)

            loss = self.compute_full_nll(params, aggregated_data, distances, spline_object)
            loss.backward()  # Backpropagate the loss

            optimizer.step()  # Update the parameters
            scheduler.step()  # Update the learning rate

            # Convergence Check
            current_loss = loss.item()
            if abs(prev_loss - current_loss) < tol:
                print(f"Converged at epoch {epoch}")
                print(f'Epoch {epoch+1}, : Loss: {loss.item()}, \n vecc Parameters: {params.detach().numpy()}')
                break

            prev_loss = current_loss

        # The reported loss was evaluated before the last optimizer step.
        params = [torch.round(x*1000).detach().numpy()/1000 for x in params]
        loss = (torch.round(loss*1000)/1000).item()
        print(f'FINAL STATE: Epoch {epoch+1}, Loss: {loss}, \n vecc Parameters: {params}')
        return params + [loss], epoch

    def fit_vecchia(self, params:torch.Tensor, optimizer:torch.optim.Optimizer, scheduler:torch.optim.lr_scheduler, epochs:int=10 ):

        """
        Run the training loop for the spline Vecchia likelihood.

        Each epoch rebuilds the covariance cache and evaluates the Vecchia
        likelihood through ``compute_vecchia_nll``. Gradients, loss and parameters
        are printed every 10th epoch. The loop stops early once the loss changes
        by less than 1e-3 between consecutive epochs.

        Args:
            params (torch.Tensor): Model parameters; updated in place by the optimizer.
            optimizer (torch.optim.Optimizer): Optimizer for updating parameters.
            scheduler (torch.optim.lr_scheduler): Learning rate scheduler.
            epochs (int): Maximum number of epochs to train; must be at least 1.

        Returns:
            list: Final parameters rounded to 3 decimals, followed by the rounded loss.
            int: Zero-based index of the last epoch that was executed.

        Raises:
            ValueError: If ``epochs`` is smaller than 1.
        """
        # With zero iterations `loss` and `epoch` would be unbound further down.
        if epochs < 1:
            raise ValueError(f"epochs must be at least 1, got {epochs}")

        prev_loss= float('inf')
        # 1e-3: Faster convergence, slightly lower accuracy than 1e-4
        tol = 1e-3  # Convergence tolerance

        for epoch in range(epochs):
            optimizer.zero_grad()  # Zero the gradients

            loss = self.compute_vecchia_nll(params)
            loss.backward()  # Backpropagate the loss

            # Gradient and Parameter Logging for every 10th epoch
            if epoch % 10 == 0:
                print(f'Epoch {epoch+1}, Gradients: {params.grad.numpy()}\n Loss: {loss.item()}, Parameters: {params.detach().numpy()}')

            optimizer.step()  # Update the parameters
            scheduler.step()  # Update the learning rate

            # Convergence Check
            current_loss = loss.item()
            if abs(prev_loss - current_loss) < tol:
                print(f"Converged at epoch {epoch}")
                print(f'Epoch {epoch+1}, : Loss: {loss.item()}, \n vecc Parameters: {params.detach().numpy()}')
                break

            prev_loss = current_loss

        # The reported loss was evaluated before the last optimizer step.
        params = [torch.round(x*1000).detach().numpy()/1000 for x in params]
        loss = (torch.round(loss*1000)/1000).item()
        print(f'FINAL STATE: Epoch {epoch+1}, Loss: {loss}, \n vecc Parameters: {params}')
        return params + [loss], epoch