In [2]:
# Standard libraries
import sys
# Add your custom path
gems_tco_path = "/Users/joonwonlee/Documents/GEMS_TCO-1/src"
sys.path.append(gems_tco_path)
import logging
import argparse # Argument parsing

# Data manipulation and analysis
import pandas as pd
import numpy as np
import pickle
import torch
import torch.optim as optim
import copy                    # clone tensor
import time

# Custom imports
import GEMS_TCO
from GEMS_TCO import kernels
from GEMS_TCO import data_preprocess 
from GEMS_TCO import kernels 
from GEMS_TCO import orderings as _orderings 
from GEMS_TCO import load_data
from GEMS_TCO import alg_optimization, alg_opt_Encoder
from GEMS_TCO import configuration as config

from typing import Optional, List, Tuple
from pathlib import Path
import typer
import json
from json import JSONEncoder

from GEMS_TCO.data_loader import load_data2

In [171]:
import torch
import numpy as np
import scipy.optimize as opt
from scipy.spatial.distance import cdist
from numpy.linalg import det, inv

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 25
LON_MIN, LON_MAX = 113, 133
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2 = 42  # TARGET Variance (Parameter to be estimated)
RANGE_A = 2.0  # FIXED Range
NUGGET = 0.0   # FIXED Nugget (Corrected from integer 0 to float 0.0)

# Ozone Value Parameters
OZONE_MEAN = 240.0

def exponential_covariance(distances, sigma2, a, nugget):
    """Exponential covariance function (Matern nu=0.5)."""
    # Cov(h) = sigma^2 * exp(-h/a)
    cov = sigma2 * np.exp(-distances / a)
    # Add nugget effect to the diagonal (where distance h=0)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += nugget
    return cov

# üí° CHANGE 1: Function now accepts only one parameter (sigma2)
def neg_log_likelihood_sigma2(params_sigma2, distances, z, fixed_a, fixed_nugget):
    """
    Calculates the negative log-likelihood for the single parameter (sigma2).
    """
    sigma2 = params_sigma2[0]
    a = fixed_a
    nugget = fixed_nugget
    N = z.size
    
    # Enforce bounds for the single parameter
    if sigma2 <= 1e-6:
        return 1e15 
    
    C = exponential_covariance(distances, sigma2, a, nugget)
    C = C + np.eye(N) * 1e-6

    try:
        sign, log_det = np.linalg.slogdet(C)
        if sign <= 0:
             return 1e15 
        
        C_inv = inv(C)
        quad_term = z.T @ C_inv @ z
        
        neg_LL = 0.5 * log_det + 0.5 * quad_term
        return neg_LL
        
    except np.linalg.LinAlgError:
        return 1e15

def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index):
    """Generates one hour of spatially correlated Gaussian data."""
    n_points = coords.shape[0]
    distances = cdist(coords, coords, metric='euclidean')
    Cov = exponential_covariance(distances, sigma2, a, nugget)
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov + 1e-6 * np.eye(n_points))
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      
    data_np[:, 2] = coords[:, 0] * 40 + 250    
    data_np[:, 3] = time_index                 

    return data_np

# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
df_day_map_list = []

print("--- Starting Data Generation ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

for i in range(N_DAYS):
    cur_map = {}
    cur_df_list = []
    day_str = str(i+1).zfill(2)
    
    for j in range(N_HOURS_PER_DAY):
        time_index = 21.0 + j
        
        data_np = generate_ozone_data_map(
            coords_latlon, SIGMA2, RANGE_A, NUGGET, OZONE_MEAN, time_index
        )
        data_tensor = torch.tensor(data_np, dtype=torch.float)
        
        hour_str = str(j).zfill(2) 
        key = f'{BASE_DATE}{day_str}_hm{hour_str}:53'
        
        cur_map[key] = data_tensor
        cur_df_list.append(data_tensor)

    cur_df_aggregated = torch.cat(cur_df_list, dim=0)
    df_day_aggregated_list.append(cur_df_aggregated)
    df_day_map_list.append(cur_map)

print("--- Data Generation Complete ---")


# --- 2. MLE Execution (Optimization for SIGMA2 only) ---

# Extract data for fitting
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] 

distances = cdist(coordinates, coordinates, metric='euclidean')
z_centered = z_data - np.mean(z_data)

# üí° CHANGE 2: Initial Guess (p0) contains only SIGMA2
# We start slightly off the target value of 31.0 for a true optimization test.
p0 = np.array([SIGMA2 + 5]) 

print("\n--- Starting MLE Optimization (SciPy - Estimating SIGMA2 Only) ---")

# üí° CHANGE 3: Bounds contain only the bounds for SIGMA2
bounds = opt.Bounds([0.1], [100.0])

# Perform the minimization. Note the new function name and extra arguments (fixed_a, fixed_nugget).
result = opt.minimize(
    neg_log_likelihood_sigma2, 
    p0, 
    args=(distances, z_centered, RANGE_A, NUGGET), # Pass fixed parameters as args
    method='L-BFGS-B', 
    bounds=bounds,
    options={'disp': True, 'ftol': 1e-6}
)


# --- 3. Display Results ---

if result.success:
    fitted_sigma2 = result.x[0]

    print(f"\nOriginal Parameters (Used for Generation): Range (a)={RANGE_A}, Variance (œÉ¬≤)={SIGMA2}, Nugget (Œ∑¬≤)={NUGGET}")
    print("\nFitted Parameter (SciPy MLE):")
    print(f"  * Variance (œÉ¬≤): {fitted_sigma2:.3f} (Target: {SIGMA2})")
    print(f"  * Range (a): {RANGE_A} (FIXED)")
    print(f"  * Nugget (Œ∑¬≤): {NUGGET} (FIXED)")
    print(f"  * Optimization Success: {result.success}")
    print(f"  * Final -LL Value: {result.fun:.2f}")
else:
    print("\nOptimization failed.")
    print(f"Reason: {result.message}")

print("-------------------------------------------------------")

--- Starting Data Generation ---
--- Data Generation Complete ---

--- Starting MLE Optimization (SciPy - Estimating SIGMA2 Only) ---

Original Parameters (Used for Generation): Range (a)=2.0, Variance (œÉ¬≤)=42, Nugget (Œ∑¬≤)=0.0

Fitted Parameter (SciPy MLE):
  * Variance (œÉ¬≤): 41.566 (Target: 42)
  * Range (a): 2.0 (FIXED)
  * Nugget (Œ∑¬≤): 0.0 (FIXED)
  * Optimization Success: True
  * Final -LL Value: 2046.50
-------------------------------------------------------


# LBFGS VS ADAMS FOR SIGMA^2

In [179]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist
# Removed 'scipy.optimize as opt' and 'from numpy.linalg import inv'

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1000
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 50 
GRID_Y = 20  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 30.0 # TARGET Variance
RANGE_A = 1   # FIXED Range
NUGGET = 3.0     # FIXED Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.01

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
# üí° CORRECTED PARAMETER NAME: max_eval controls function evaluations including line search
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# --- COVARIANCE FUNCTIONS ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    # Cov(h) = sigma^2 * exp(-h/a)
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    # Add nugget effect + jitter to the diagonal
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- SHARED NLL Function for PyTorch (Adam and LBFGS) ---
def neg_log_likelihood_torch(raw_params_sigma2, distances_torch, z_centered_torch, fixed_a, fixed_nugget):
    """Calculates -LL for PyTorch (using square reparameterization)."""
    # Reparameterization: sigma2 = raw_params_sigma2^2
    sigma2 = raw_params_sigma2.pow(2).squeeze()
    
    C = exponential_covariance_torch(distances_torch, sigma2, fixed_a, fixed_nugget)
    
    try:
        # Use PyTorch linear algebra functions: Cholesky decomposition
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device)


# --- Data Generation Function ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index):
    n_points = coords.shape[0]
    distances = cdist(coords, coords, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      
    data_np[:, 2] = coords[:, 0] * 40 + 250    
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

# Generate only one hour of data for fitting the spatial model
data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A, NUGGET, OZONE_MEAN, 21.0
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))

print("--- Data Generation Complete ---")

# --- 2. Data Preparation ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] 
distances_np = cdist(coordinates, coordinates, metric='euclidean')
z_centered_np = z_data - np.mean(z_data)

# Convert to Torch Tensors
distances_torch = torch.tensor(distances_np, dtype=torch.float)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

# --- Initial Parameter Setup (Shared) ---
# Initial guess for raw_sigma2: sqrt(target + 5) = sqrt(47)
raw_sigma2_start = np.sqrt(SIGMA2_TRUE + 5) 

# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch)
# ----------------------------------------------------

# Reset parameters for LBFGS
raw_params_lbfgs = torch.tensor(
    [raw_sigma2_start], 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL # üí° CORRECTED: Use max_eval instead of max_ls
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) ---")

# L-BFGS requires a "closure" function
def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    loss = neg_log_likelihood_torch(raw_params_lbfgs, distances_torch, z_centered_torch, RANGE_A, NUGGET)
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

# L-BFGS Optimization Loop
for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        current_sigma2 = raw_params_lbfgs.pow(2).item()
        grad_value = raw_params_lbfgs.grad.item() if raw_params_lbfgs.grad is not None else 0.0
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, Sigma2: {current_sigma2:.3f}, Grad: {grad_value:.4f}")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch)
# ----------------------------------------------------

# Reset parameters for Adam (Use the same start point)
raw_params_adam = torch.tensor(
    [raw_sigma2_start], 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) ---")

# Adam Optimization Loop
for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    loss = neg_log_likelihood_torch(raw_params_adam, distances_torch, z_centered_torch, RANGE_A, NUGGET)
    
    if torch.isinf(loss) or torch.isnan(loss):
        loss = torch.tensor(1e15, device=loss.device)
        break

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        current_sigma2 = raw_params_adam.pow(2).item()
        grad_value = raw_params_adam.grad.item() if raw_params_adam.grad is not None else 0.0
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, Sigma2: {current_sigma2:.3f}, Grad: {grad_value:.4f}")

# ----------------------------------------------------
# 3. Display Results
# ----------------------------------------------------
print("\n" + "="*50)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A}, Nugget (Œ∑¬≤)={NUGGET}")
print("="*50)

# L-BFGS Results
fitted_sigma2_lbfgs = raw_params_lbfgs.pow(2).detach().numpy().item()
print("‚ú® PyTorch L-BFGS Results:")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")
print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")

# Adam Results
fitted_sigma2_adam = raw_params_adam.pow(2).detach().numpy().item()
print("\nüöÄ PyTorch Adam Results:")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")
print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*50)

--- Starting Data Generation ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) ---
LBFGS Step 5/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 10/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 15/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 20/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 25/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 30/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 35/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 40/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 45/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000
LBFGS Step 50/50, NLL: 1706.40, Sigma2: 28.792, Grad: -0.0000

--- B. Starting MLE Optimization (PyTorch Adam) ---
Adam Epoch 50/500, NLL: 1706.66, Sigma2: 30.132, Grad: 3.9830
Adam Epoch 100/500, NLL: 1706.40, Sigma2: 28.767, Grad: -0.0630
Adam Epoch 150/500, NLL: 1706.40, Sigma2: 28.790, Grad: -0.0059
Adam Epoch 200/50

# LBFGS VS ADAMS FOR RANGE

In [142]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist
# Removed 'scipy.optimize as opt' and 'from numpy.linalg import inv'

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 42.0 # FIXED Variance (Used as target and fixed value)
RANGE_A_TRUE = 2.0    # TARGET Range
NUGGET = 0.0     # FIXED Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.01

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0
# üí° FIXED PARAMETER: Use the true value for the fixed parameter
SIGMA2_FIXED = SIGMA2_TRUE 

# --- COVARIANCE FUNCTIONS ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    # Cov(h) = sigma^2 * exp(-h/a)
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    # Add nugget effect + jitter to the diagonal
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- SHARED NLL Function for PyTorch (Adam and LBFGS) ---
# üí° MODIFIED: params is now raw_params_range_a. fixed_sigma2 replaces raw_params_sigma2
def neg_log_likelihood_torch(raw_params_range_a, distances_torch, z_centered_torch, fixed_sigma2, fixed_nugget):
    """Calculates -LL for PyTorch (optimizing RANGE only, fixing SIGMA2)."""
    
    # Reparameterization: a = raw_range_a^2
    range_a = raw_params_range_a.pow(2).squeeze()
    
    C = exponential_covariance_torch(distances_torch, fixed_sigma2, range_a, fixed_nugget)
    
    try:
        # Use PyTorch linear algebra functions: Cholesky decomposition
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device)


# --- Data Generation Function ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index):
    n_points = coords.shape[0]
    distances = cdist(coords, coords, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      
    data_np[:, 2] = coords[:, 0] * 40 + 250    
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

# Generate only one hour of data for fitting the spatial model
data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET, OZONE_MEAN, 21.0
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))

print("--- Data Generation Complete ---")

# --- 2. Data Preparation ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] 
distances_np = cdist(coordinates, coordinates, metric='euclidean')
z_centered_np = z_data - np.mean(z_data)

# Convert to Torch Tensors
distances_torch = torch.tensor(distances_np, dtype=torch.float)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

# --- Initial Parameter Setup (Shared) ---
# üí° MODIFIED: Initial guess for raw_range_a: sqrt(target + 0.5) = sqrt(2.5)
raw_range_a_start = np.sqrt(RANGE_A_TRUE + 2) 

# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch)
# ----------------------------------------------------

# Reset parameters for LBFGS
raw_params_lbfgs = torch.tensor(
    [raw_range_a_start], 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) ---")
print(f"Fixed Variance (œÉ¬≤): {SIGMA2_FIXED:.1f}")

# L-BFGS requires a "closure" function
def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    # üí° MODIFIED: Pass fixed_sigma2 instead of raw_params_sigma2
    loss = neg_log_likelihood_torch(raw_params_lbfgs, distances_torch, z_centered_torch, SIGMA2_FIXED, NUGGET)
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

# L-BFGS Optimization Loop
for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        current_a = raw_params_lbfgs.pow(2).item()
        grad_value = raw_params_lbfgs.grad.item() if raw_params_lbfgs.grad is not None else 0.0
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, Range_a: {current_a:.3f}, Grad: {grad_value:.4f}")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch)
# ----------------------------------------------------

# Reset parameters for Adam (Use the same start point)
raw_params_adam = torch.tensor(
    [raw_range_a_start], 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) ---")
print(f"Fixed Variance (œÉ¬≤): {SIGMA2_FIXED:.1f}")

# Adam Optimization Loop
for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    # üí° MODIFIED: Pass fixed_sigma2 instead of raw_params_sigma2
    loss = neg_log_likelihood_torch(raw_params_adam, distances_torch, z_centered_torch, SIGMA2_FIXED, NUGGET)
    
    if torch.isinf(loss) or torch.isnan(loss):
        loss = torch.tensor(1e15, device=loss.device)
        break

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        current_a = raw_params_adam.pow(2).item()
        grad_value = raw_params_adam.grad.item() if raw_params_adam.grad is not None else 0.0
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, Range_a: {current_a:.3f}, Grad: {grad_value:.4f}")

# ----------------------------------------------------
# 3. Display Results
# ----------------------------------------------------
print("\n" + "="*50)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, Nugget (Œ∑¬≤)={NUGGET}")
print(f"**FIXED PARAMETER: Variance (œÉ¬≤)={SIGMA2_FIXED}**")
print("="*50)

# L-BFGS Results
fitted_range_a_lbfgs = raw_params_lbfgs.pow(2).detach().numpy().item()
print("‚ú® PyTorch L-BFGS Results:")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")
print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")

# Adam Results
fitted_range_a_adam = raw_params_adam.pow(2).detach().numpy().item()
print("\nüöÄ PyTorch Adam Results:")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")
print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*50)

--- Starting Data Generation ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) ---
Fixed Variance (œÉ¬≤): 42.0
LBFGS Step 5/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 10/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 15/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 20/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 25/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 30/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 35/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 40/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 45/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000
LBFGS Step 50/50, NLL: 1484.49, Range_a: 1.955, Grad: 0.0000

--- B. Starting MLE Optimization (PyTorch Adam) ---
Fixed Variance (œÉ¬≤): 42.0
Adam Epoch 50/500, NLL: 1500.56, Range_a: 2.448, Grad: 182.9287
Adam Epoch 100/500, NLL: 1484.51, Range_a: 1.971, Grad: 7.2815
Adam Epoch 150/500, NLL: 1484.49, 

# lbfgs vs adams fit both  sigma^2 and range

In [180]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist
# Removed 'scipy.optimize as opt' and 'from numpy.linalg import inv'

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 42.0 # TARGET Variance
RANGE_A_TRUE = 2.0    # TARGET Range
NUGGET = 0.0     # FIXED Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.01

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# --- COVARIANCE FUNCTIONS ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    # Cov(h) = sigma^2 * exp(-h/a)
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    # Add nugget effect + jitter to the diagonal
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- SHARED NLL Function for PyTorch (Adam and LBFGS) ---
# üí° MODIFIED: Accepts raw_params_all which contains [raw_sigma2, raw_range_a]
def neg_log_likelihood_torch(raw_params_all, distances_torch, z_centered_torch, fixed_nugget):
    """Calculates -LL for PyTorch (optimizing SIGMA2 and RANGE)."""
    
    # Reparameterization: sigma2 = raw_sigma2^2, range_a = raw_range_a^2
    sigma2 = raw_params_all[0].pow(2).squeeze()
    range_a = raw_params_all[1].pow(2).squeeze()
    
    C = exponential_covariance_torch(distances_torch, sigma2, range_a, fixed_nugget)
    
    try:
        # Use PyTorch linear algebra functions: Cholesky decomposition
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device)


# --- Data Generation Function (Unchanged) ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index):
    n_points = coords.shape[0]
    distances = cdist(coords, coords, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      
    data_np[:, 2] = coords[:, 0] * 40 + 250    
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

# Generate only one hour of data for fitting the spatial model
data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET, OZONE_MEAN, 21.0
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))

print("--- Data Generation Complete ---")

# --- 2. Data Preparation ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] 
distances_np = cdist(coordinates, coordinates, metric='euclidean')
z_centered_np = z_data - np.mean(z_data)

# Convert to Torch Tensors
distances_torch = torch.tensor(distances_np, dtype=torch.float)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

# --- Initial Parameter Setup (Shared) ---
# üí° NEW: Initial guess for raw_sigma2 and raw_range_a. Starting off target for both.
raw_sigma2_start = np.sqrt(SIGMA2_TRUE + 5)     # Target: sqrt(42) ‚âà 6.48. Start: sqrt(47) ‚âà 6.86
raw_range_a_start = np.sqrt(RANGE_A_TRUE + 0.5) # Target: sqrt(2) ‚âà 1.41. Start: sqrt(2.5) ‚âà 1.58

initial_params = [raw_sigma2_start, raw_range_a_start]


# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch)
# ----------------------------------------------------

# Reset parameters for LBFGS
raw_params_lbfgs = torch.tensor(
    initial_params, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) ---")

# L-BFGS requires a "closure" function
def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    # üí° MODIFIED: Pass raw_params_lbfgs and fixed_nugget
    loss = neg_log_likelihood_torch(raw_params_lbfgs, distances_torch, z_centered_torch, NUGGET)
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

# L-BFGS Optimization Loop
for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        current_sigma2 = raw_params_lbfgs[0].pow(2).item()
        current_a = raw_params_lbfgs[1].pow(2).item()
        grad_sigma2 = raw_params_lbfgs.grad[0].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_a = raw_params_lbfgs.grad[1].item() if raw_params_lbfgs.grad is not None else 0.0
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}], Grads: [{grad_sigma2:.4f}, {grad_a:.4f}]")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch)
# ----------------------------------------------------

# Reset parameters for Adam (Use the same start point)
raw_params_adam = torch.tensor(
    initial_params, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) ---")

# Adam Optimization Loop
for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    # üí° MODIFIED: Pass raw_params_adam and fixed_nugget
    loss = neg_log_likelihood_torch(raw_params_adam, distances_torch, z_centered_torch, NUGGET)
    
    if torch.isinf(loss) or torch.isnan(loss):
        loss = torch.tensor(1e15, device=loss.device)
        break

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        current_sigma2 = raw_params_adam[0].pow(2).item()
        current_a = raw_params_adam[1].pow(2).item()
        grad_sigma2 = raw_params_adam.grad[0].item() if raw_params_adam.grad is not None else 0.0
        grad_a = raw_params_adam.grad[1].item() if raw_params_adam.grad is not None else 0.0
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}], Grads: [{grad_sigma2:.4f}, {grad_a:.4f}]")

# ----------------------------------------------------
# 3. Display Results
# ----------------------------------------------------
print("\n" + "="*50)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, Nugget (Œ∑¬≤)={NUGGET}")
print("="*50)

# L-BFGS Results
fitted_sigma2_lbfgs = raw_params_lbfgs[0].pow(2).detach().numpy().item()
fitted_range_a_lbfgs = raw_params_lbfgs[1].pow(2).detach().numpy().item()
print("‚ú® PyTorch L-BFGS Results:")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")
print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")

# Adam Results
fitted_sigma2_adam = raw_params_adam[0].pow(2).detach().numpy().item()
fitted_range_a_adam = raw_params_adam[1].pow(2).detach().numpy().item()
print("\nüöÄ PyTorch Adam Results:")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")
print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*50)

--- Starting Data Generation ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) ---
LBFGS Step 5/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 10/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 15/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 20/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 25/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 30/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 35/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 40/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 45/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.622], Grads: [-0.0006, 0.0027]
LBFGS Step 50/50, NLL: 1444.80, Params: [œÉ¬≤: 32.857, a: 1.6

# square parameteriazation + stablize parametrization

In [181]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist

# --- 0. Global Parameters and Utility Functions ---
# CHANGED: Increased grid resolution to increase data points (40*28 -> 80*56)
GRID_X = 80  
GRID_Y = 56  
N_SPATIAL_POINTS = GRID_X * GRID_Y # New total: 4480 points
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 42.0 # TARGET Variance
RANGE_A_TRUE = 2   # TARGET Range
NUGGET = 0.0     # FIXED Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.01

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# Calculate target stable parameters for display
PHI1_TARGET = SIGMA2_TRUE / RANGE_A_TRUE    # 42.0 / 1.5 = 28.0
PHI2_TARGET = 1.0 / RANGE_A_TRUE            # 1.0 / 1.5 = 0.667

# --- COVARIANCE FUNCTIONS ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    # Cov(h) = sigma^2 * exp(-h/a)
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    # Add nugget effect + jitter to the diagonal
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- SHARED NLL Function using STABLE REPARAMETERIZATION ---
def neg_log_likelihood_torch_stable(raw_params_phi, distances_torch, z_centered_torch, fixed_nugget):
    """
    Calculates -LL for PyTorch (optimizing Stable Reparameterization).
    raw_params_phi[0] = raw_phi1_sqrt (for sigma2/a)
    raw_params_phi[1] = raw_phi2_sqrt (for 1/a)
    """
    
    # 1. Apply Square Reparameterization to raw parameters to ensure positivity
    phi1 = raw_params_phi[0].pow(2).squeeze() # Phi1 = sigma2 / a (TARGET RATIO)
    phi2 = raw_params_phi[1].pow(2).squeeze() # Phi2 = 1 / a (INVERSE RANGE)
    
    epsilon = 1e-6
    
    # 2. Derive Original Parameters
    range_a = 1.0 / (phi2 + epsilon)          # Range: a = 1 / Phi2
    sigma2 = phi1 / (phi2 + epsilon)          # Variance: sigma2 = Phi1 / Phi2
    
    C = exponential_covariance_torch(distances_torch, sigma2, range_a, fixed_nugget)
    
    try:
        # Use PyTorch linear algebra functions: Cholesky decomposition
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device)


# --- Data Generation Function (Unchanged) ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index):
    n_points = coords.shape[0]
    distances = cdist(coords, coords, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      
    data_np[:, 2] = coords[:, 0] * 40 + 250    
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print(f"--- Starting Data Generation ({N_SPATIAL_POINTS} points) ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

# Generate only one hour of data for fitting the spatial model
data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET, OZONE_MEAN, 21.0
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))

print("--- Data Generation Complete ---")

# --- 2. Data Preparation ---
# NOTE: This slice now takes the first 4480 points (all of them)
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] 
distances_np = cdist(coordinates, coordinates, metric='euclidean')
z_centered_np = z_data - np.mean(z_data)

# Convert to Torch Tensors
distances_torch = torch.tensor(distances_np, dtype=torch.float)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

# --- Initial Parameter Setup (Shared) ---
# Start Phi1 (Target 28.0) at 32.0 (sqrt 5.66)
raw_phi1_sqrt_start = np.sqrt(PHI1_TARGET + 4.0) 
# Start Phi2 (Target 0.667) at 0.8 (sqrt 0.64)
raw_phi2_sqrt_start = np.sqrt(PHI2_TARGET + 0.133) 

initial_params_stable = [raw_phi1_sqrt_start, raw_phi2_sqrt_start]


# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch) - STABLE
# ----------------------------------------------------

# Reset parameters for LBFGS
raw_params_lbfgs = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) - STABLE ---")

# L-BFGS requires a "closure" function
def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    # Use the stable NLL function
    loss = neg_log_likelihood_torch_stable(raw_params_lbfgs, distances_torch, z_centered_torch, NUGGET)
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

# L-BFGS Optimization Loop
for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        phi1 = raw_params_lbfgs[0].pow(2).item()
        phi2 = raw_params_lbfgs[1].pow(2).item()
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        grad_phi1 = raw_params_lbfgs.grad[0].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi2 = raw_params_lbfgs.grad[1].item() if raw_params_lbfgs.grad is not None else 0.0
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}], Grads: [Œ¶1_raw: {grad_phi1:.4f}, Œ¶2_raw: {grad_phi2:.4f}]")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch) - STABLE
# ----------------------------------------------------

# Reset parameters for Adam (Use the same start point)
raw_params_adam = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) - STABLE ---")

# Adam Optimization Loop
for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    # Use the stable NLL function
    loss = neg_log_likelihood_torch_stable(raw_params_adam, distances_torch, z_centered_torch, NUGGET)
    
    if torch.isinf(loss) or torch.isnan(loss):
        loss = torch.tensor(1e15, device=loss.device)
        break

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        phi1 = raw_params_adam[0].pow(2).item()
        phi2 = raw_params_adam[1].pow(2).item()
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        grad_phi1 = raw_params_adam.grad[0].item() if raw_params_adam.grad is not None else 0.0
        grad_phi2 = raw_params_adam.grad[1].item() if raw_params_adam.grad is not None else 0.0
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}], Grads: [Œ¶1_raw: {grad_phi1:.4f}, Œ¶2_raw: {grad_phi2:.4f}]")

# ----------------------------------------------------
# 3. Display Results
# ----------------------------------------------------
print("\n" + "="*50)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, Nugget (Œ∑¬≤)={NUGGET}")
print("="*50)

# L-BFGS Results
phi1_lbfgs = raw_params_lbfgs[0].pow(2).detach().numpy().item()
phi2_lbfgs = raw_params_lbfgs[1].pow(2).detach().numpy().item()
fitted_sigma2_lbfgs = phi1_lbfgs / (phi2_lbfgs + 1e-6)
fitted_range_a_lbfgs = 1.0 / (phi2_lbfgs + 1e-6)

print("‚ú® PyTorch L-BFGS Results (Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Ratio (Œ¶‚ÇÅ=œÉ¬≤/a): {phi1_lbfgs:.3f} (Target Ratio: {PHI1_TARGET:.3f})")
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")
print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")

# Adam Results
phi1_adam = raw_params_adam[0].pow(2).detach().numpy().item()
phi2_adam = raw_params_adam[1].pow(2).detach().numpy().item()
fitted_sigma2_adam = phi1_adam / (phi2_adam + 1e-6)
fitted_range_a_adam = 1.0 / (phi2_adam + 1e-6)

print("\nüöÄ PyTorch Adam Results (Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Ratio (Œ¶‚ÇÅ=œÉ¬≤/a): {phi1_adam:.3f} (Target Ratio: {PHI1_TARGET:.3f})")
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")
print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*50)



--- Starting Data Generation (4480 points) ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) - STABLE ---
LBFGS Step 5/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 10/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 15/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 20/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 25/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 30/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 35/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0019, Œ¶2_raw: -0.0053]
LBFGS Step 40/50, NLL: 4230.51, Params: [œÉ¬≤: 54.321, a: 2.632], Grads: [Œ¶1_raw: -0.0

In [185]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 25.0 # TARGET Variance
RANGE_A_TRUE = 1    # TARGET Range
NUGGET = 3.0     # FIXED Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.01

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# --- COVARIANCE FUNCTIONS ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    # Cov(h) = sigma^2 * exp(-h/a)
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    # Add nugget effect + jitter to the diagonal
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- SHARED NLL Function using STABLE REPARAMETERIZATION ---
def neg_log_likelihood_torch_stable(raw_params_phi, distances_torch, z_centered_torch, fixed_nugget):
    """
    Calculates -LL for PyTorch (optimizing Stable Reparameterization).
    raw_params_phi[0] = raw_phi1_sqrt (for sigma2/a)
    raw_params_phi[1] = raw_phi2_sqrt (for 1/a)
    """
    
    # 1. Apply Square Reparameterization to raw parameters to ensure positivity
    phi1 = raw_params_phi[0].pow(2).squeeze() # Phi1 = sigma2 / a (TARGET RATIO)
    phi2 = raw_params_phi[1].pow(2).squeeze() # Phi2 = 1 / a (INVERSE RANGE)
    
    epsilon = 1e-6
    
    # 2. Derive Original Parameters
    range_a = 1.0 / (phi2 + epsilon)          # Range: a = 1 / Phi2
    sigma2 = phi1 / (phi2 + epsilon)          # Variance: sigma2 = Phi1 / Phi2
    
    C = exponential_covariance_torch(distances_torch, sigma2, range_a, fixed_nugget)
    
    try:
        # Use PyTorch linear algebra functions: Cholesky decomposition
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device)


# --- Data Generation Function (Unchanged) ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index):
    n_points = coords.shape[0]
    distances = cdist(coords, coords, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      
    data_np[:, 2] = coords[:, 0] * 40 + 250    
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

# Generate only one hour of data for fitting the spatial model
data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET, OZONE_MEAN, 21.0
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))

print("--- Data Generation Complete ---")

# --- 2. Data Preparation ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] 
distances_np = cdist(coordinates, coordinates, metric='euclidean')
z_centered_np = z_data - np.mean(z_data)

# Convert to Torch Tensors
distances_torch = torch.tensor(distances_np, dtype=torch.float)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

# --- Initial Parameter Setup (Shared) ---
# Calculate target stable parameters
# PHI1 is the Target Ratio: sigma2 / a
PHI1_TARGET = SIGMA2_TRUE / RANGE_A_TRUE    # 42.0 / 1.5 = 28.0
# PHI2 is the Inverse Range: 1 / a
PHI2_TARGET = 1.0 / RANGE_A_TRUE            # 1.0 / 1.5 = 0.667

# üí° NEW: Initial guess for raw_phi1_sqrt and raw_phi2_sqrt. 
# Start Phi1 (Target 28.0) at 32.0 (sqrt 5.66)
raw_phi1_sqrt_start = np.sqrt(PHI1_TARGET + 4.0) 
# Start Phi2 (Target 0.667) at 0.8 (sqrt 0.64)
raw_phi2_sqrt_start = np.sqrt(PHI2_TARGET + 0.133) 

initial_params_stable = [raw_phi1_sqrt_start, raw_phi2_sqrt_start]


# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch) - STABLE
# ----------------------------------------------------

# Reset parameters for LBFGS
raw_params_lbfgs = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) - STABLE ---")

# L-BFGS requires a "closure" function
def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    # Use the stable NLL function
    loss = neg_log_likelihood_torch_stable(raw_params_lbfgs, distances_torch, z_centered_torch, NUGGET)
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

# L-BFGS Optimization Loop
for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        phi1 = raw_params_lbfgs[0].pow(2).item()
        phi2 = raw_params_lbfgs[1].pow(2).item()
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        grad_phi1 = raw_params_lbfgs.grad[0].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi2 = raw_params_lbfgs.grad[1].item() if raw_params_lbfgs.grad is not None else 0.0
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}], Grads: [Œ¶1_raw: {grad_phi1:.4f}, Œ¶2_raw: {grad_phi2:.4f}]")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch) - STABLE
# ----------------------------------------------------

# Reset parameters for Adam (Use the same start point)
raw_params_adam = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) - STABLE ---")

# Adam Optimization Loop
for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    # Use the stable NLL function
    loss = neg_log_likelihood_torch_stable(raw_params_adam, distances_torch, z_centered_torch, NUGGET)
    
    if torch.isinf(loss) or torch.isnan(loss):
        loss = torch.tensor(1e15, device=loss.device)
        break

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        phi1 = raw_params_adam[0].pow(2).item()
        phi2 = raw_params_adam[1].pow(2).item()
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        grad_phi1 = raw_params_adam.grad[0].item() if raw_params_adam.grad is not None else 0.0
        grad_phi2 = raw_params_adam.grad[1].item() if raw_params_adam.grad is not None else 0.0
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}], Grads: [Œ¶1_raw: {grad_phi1:.4f}, Œ¶2_raw: {grad_phi2:.4f}]")

# ----------------------------------------------------
# 3. Display Results
# ----------------------------------------------------
print("\n" + "="*50)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, Nugget (Œ∑¬≤)={NUGGET}")
print("="*50)

# L-BFGS Results
phi1_lbfgs = raw_params_lbfgs[0].pow(2).detach().numpy().item()
phi2_lbfgs = raw_params_lbfgs[1].pow(2).detach().numpy().item()
fitted_sigma2_lbfgs = phi1_lbfgs / (phi2_lbfgs + 1e-6)
fitted_range_a_lbfgs = 1.0 / (phi2_lbfgs + 1e-6)

print("‚ú® PyTorch L-BFGS Results (Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Ratio (Œ¶‚ÇÅ=œÉ¬≤/a): {phi1_lbfgs:.3f} (Target Ratio: {PHI1_TARGET:.3f})") # <-- NEW
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")
print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")

# Adam Results
phi1_adam = raw_params_adam[0].pow(2).detach().numpy().item()
phi2_adam = raw_params_adam[1].pow(2).detach().numpy().item()
fitted_sigma2_adam = phi1_adam / (phi2_adam + 1e-6)
fitted_range_a_adam = 1.0 / (phi2_adam + 1e-6)

print("\nüöÄ PyTorch Adam Results (Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Ratio (Œ¶‚ÇÅ=œÉ¬≤/a): {phi1_adam:.3f} (Target Ratio: {PHI1_TARGET:.3f})") # <-- NEW
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")
print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*50)

--- Starting Data Generation ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) - STABLE ---
LBFGS Step 5/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 10/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 15/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 20/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 25/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 30/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 35/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]
LBFGS Step 40/50, NLL: 1812.62, Params: [œÉ¬≤: 18.531, a: 0.779], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0001]

# L BFGS vs Adams  1120 reparametrization for anisotrpy

In [195]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 30.0      # TARGET Variance (theta_1 / theta_2)
RANGE_A_TRUE = 1.5      # TARGET Range (1 / theta_2)
ANISOTROPY_RATIO_TRUE = 2.0 # TARGET Anisotropy (theta_3)
# üí° NEW: Define the target for the parameter we actually optimize (theta_3^2)
PHI3_TARGET_SQ = ANISOTROPY_RATIO_TRUE**2 # Target = 4.0
NUGGET = 2.0          # FIXED Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.01

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# --- COVARIANCE FUNCTIONS ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- üí° MODIFIED NLL Function (STABLE FIX) ---
def neg_log_likelihood_torch_stable(raw_params_phi, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch, fixed_nugget):
    """
    Calculates -LL for PyTorch (optimizing Stable Reparameterization).
    raw_params_phi[0] = raw_phi1_sqrt (for theta_1 = sigma2/a)
    raw_params_phi[1] = raw_phi2_sqrt (for theta_2 = 1/a)
    raw_params_phi[2] = raw_phi3_sqrt (for phi_3 = theta_3^2)
    """
    
    epsilon = 1e-6
    
    # 1. üí° FIX: Apply epsilon *after* pow(2) to prevent parameters from being 0
    phi1 = raw_params_phi[0].pow(2).squeeze() + epsilon # theta_1 = sigma2 / a
    phi2 = raw_params_phi[1].pow(2).squeeze() + epsilon # theta_2 = 1 / a
    phi3 = raw_params_phi[2].pow(2).squeeze() + epsilon # üí° phi_3 = theta_3^2 (the squared ratio)
    
    # 2. Derive Original Parameters (no epsilon needed here now)
    range_a = 1.0 / phi2          # Range: a = 1 / theta_2
    sigma2 = phi1 / phi2          # Variance: sigma2 = theta_1 / theta_2
    
    # 3. Compute Anisotropic Distance (no epsilon needed here now)
    # d_aniso^2 = (d_lon / theta_3)^2 + d_lat^2 = d_lon^2 / theta_3^2 + d_lat^2
    aniso_dist_sq = (d_lon_sq_torch / phi3) + d_lat_sq_torch
    
    # üí° Add small epsilon *inside* sqrt to prevent inf gradient at 0
    aniso_dist = torch.sqrt(aniso_dist_sq + epsilon)
    
    # 4. Calculate Covariance Matrix C
    C = exponential_covariance_torch(aniso_dist, sigma2, range_a, fixed_nugget)
    
    try:
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        
        # Check for nan/inf *before* returning
        if torch.isnan(neg_LL) or torch.isinf(neg_LL):
            return torch.tensor(1e15, device=C.device, dtype=torch.float) + raw_params_phi.sum() * 0.0

        return neg_LL
    except RuntimeError:
        # This catches Cholesky failures
        return torch.tensor(1e15, device=C.device, dtype=torch.float) + raw_params_phi.sum() * 0.0


# --- Data Generation Function (Unchanged) ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index, anisotropy_ratio):
    n_points = coords.shape[0]
    coords_transformed = coords.copy()
    coords_transformed[:, 1] = coords_transformed[:, 1] / anisotropy_ratio
    
    distances = cdist(coords_transformed, coords_transformed, metric='euclidean')
    
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))

    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2      # Original lon
    data_np[:, 2] = coords[:, 0] * 40 + 250    # Original lat
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation (Anisotropic) ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords]) # [lat, lon]

data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET, OZONE_MEAN, 21.0,
    ANISOTROPY_RATIO_TRUE
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))

print("--- Data Generation Complete ---")

# --- 2. Data Preparation (Unchanged) ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]] # Switch to [lon, lat]
z_centered_np = z_data - np.mean(z_data)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

lons = coordinates[:, 0:1] # (N, 1)
lats = coordinates[:, 1:2] # (N, 1)
d_lon_np = cdist(lons, lons, metric='euclidean')
d_lat_np = cdist(lats, lats, metric='euclidean')
d_lon_sq_np = np.square(d_lon_np)
d_lat_sq_np = np.square(d_lat_np)

d_lon_sq_torch = torch.tensor(d_lon_sq_np, dtype=torch.float)
d_lat_sq_torch = torch.tensor(d_lat_sq_np, dtype=torch.float)


# --- üí° MODIFIED Initial Parameter Setup ---
# Calculate target stable parameters
PHI1_TARGET = SIGMA2_TRUE / RANGE_A_TRUE    # theta_1 = 28.0
PHI2_TARGET = 1.0 / RANGE_A_TRUE            # theta_2 = 0.667
# üí° PHI3 is now the SQUARED ratio
PHI3_TARGET = PHI3_TARGET_SQ                # phi_3 = 4.0

# Initial guess for raw_phi_sqrt
raw_phi1_sqrt_start = np.sqrt(PHI1_TARGET - 3.0) # Start theta_1 at 25.0
raw_phi2_sqrt_start = np.sqrt(PHI2_TARGET - 0.1) # Start theta_2 at ~0.57
# üí° Start phi_3 at 3.0 (Target is 4.0)
raw_phi3_sqrt_start = np.sqrt(3.0) 

initial_params_stable = [
    raw_phi1_sqrt_start, 
    raw_phi2_sqrt_start,
    raw_phi3_sqrt_start
]


# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch) - STABLE
# ----------------------------------------------------

raw_params_lbfgs = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) - STABLE ---")

# L-BFGS requires a "closure" function
def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    loss = neg_log_likelihood_torch_stable(
        raw_params_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch, NUGGET
    )
    # üí° The 'if requires_grad' check is no longer needed
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

# L-BFGS Optimization Loop
for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        phi1 = raw_params_lbfgs[0].pow(2).item()
        phi2 = raw_params_lbfgs[1].pow(2).item()
        phi3 = raw_params_lbfgs[2].pow(2).item() # üí° This is theta_3^2
        
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        # üí° Show the ratio by taking the sqrt
        current_theta_3_ratio = np.sqrt(phi3) 
        
        grad_phi1 = raw_params_lbfgs.grad[0].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi2 = raw_params_lbfgs.grad[1].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi3 = raw_params_lbfgs.grad[2].item() if raw_params_lbfgs.grad is not None else 0.0
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}, Œ∏‚ÇÉ-ratio: {current_theta_3_ratio:.3f}], Grads: [Œ¶1_raw: {grad_phi1:.4f}, Œ¶2_raw: {grad_phi2:.4f}, Œ¶3_raw: {grad_phi3:.4f}]")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch) - STABLE
# ----------------------------------------------------

raw_params_adam = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) - STABLE ---")

# Adam Optimization Loop
for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    loss = neg_log_likelihood_torch_stable(
        raw_params_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch, NUGGET
    )
    
    # üí° The 'if not loss.requires_grad' check is no longer needed
    if torch.isinf(loss) or torch.isnan(loss):
        if (epoch + 1) % 50 == 0: 
            print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, Invalid loss. Skipping step.")
        continue # Skip this step

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        phi1 = raw_params_adam[0].pow(2).item()
        phi2 = raw_params_adam[1].pow(2).item()
        phi3 = raw_params_adam[2].pow(2).item() # üí° This is theta_3^2
        
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        # üí° Show the ratio by taking the sqrt
        current_theta_3_ratio = np.sqrt(phi3)
        
        grad_phi1 = raw_params_adam.grad[0].item() if raw_params_adam.grad is not None else 0.0
        grad_phi2 = raw_params_adam.grad[1].item() if raw_params_adam.grad is not None else 0.0
        grad_phi3 = raw_params_adam.grad[2].item() if raw_params_adam.grad is not None else 0.0
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}, Œ∏‚ÇÉ-ratio: {current_theta_3_ratio:.3f}], Grads: [Œ¶1_raw: {grad_phi1:.4f}, Œ¶2_raw: {grad_phi2:.4f}, Œ¶3_raw: {grad_phi3:.4f}]")

# ----------------------------------------------------
# 3. üí° MODIFIED Display Results
# ----------------------------------------------------
print("\n" + "="*75)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, Anisotropy (Œ∏‚ÇÉ-ratio)={ANISOTROPY_RATIO_TRUE}")
print(f"                 (Derived Targets: Œ∏‚ÇÅ={PHI1_TARGET:.3f}, Œ∏‚ÇÇ={PHI2_TARGET:.3f}, œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤)={PHI3_TARGET:.3f})")
print("="*75)

# L-BFGS Results
phi1_lbfgs = raw_params_lbfgs[0].pow(2).detach().numpy().item()
phi2_lbfgs = raw_params_lbfgs[1].pow(2).detach().numpy().item()
phi3_lbfgs = raw_params_lbfgs[2].pow(2).detach().numpy().item() # üí° This is theta_3^2
fitted_sigma2_lbfgs = phi1_lbfgs / (phi2_lbfgs + 1e-6)
fitted_range_a_lbfgs = 1.0 / (phi2_lbfgs + 1e-6)
fitted_ratio_lbfgs = np.sqrt(phi3_lbfgs) # üí° This is theta_3

print("‚ú® PyTorch L-BFGS Results (Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Anisotropy (Œ∏‚ÇÉ-ratio): {fitted_ratio_lbfgs:.3f} (Target: {ANISOTROPY_RATIO_TRUE})")
print(f"  ---")
print(f"  * Fitted Œ∏‚ÇÅ (œÉ¬≤/a): {phi1_lbfgs:.3f} (Target: {PHI1_TARGET:.3f})")
print(f"  * Fitted Œ∏‚ÇÇ (1/a): {phi2_lbfgs:.3f} (Target: {PHI2_TARGET:.3f})")
print(f"  * Fitted œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤): {phi3_lbfgs:.3f} (Target: {PHI3_TARGET:.3f})")
print(f"  ---")
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")
print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")

# Adam Results
phi1_adam = raw_params_adam[0].pow(2).detach().numpy().item()
phi2_adam = raw_params_adam[1].pow(2).detach().numpy().item()
phi3_adam = raw_params_adam[2].pow(2).detach().numpy().item() # üí° This is theta_3^2
fitted_sigma2_adam = phi1_adam / (phi2_adam + 1e-6)
fitted_range_a_adam = 1.0 / (phi2_adam + 1e-6)
fitted_ratio_adam = np.sqrt(phi3_adam) # üí° This is theta_3

print("\nüöÄ PyTorch Adam Results (Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Anisotropy (Œ∏‚ÇÉ-ratio): {fitted_ratio_adam:.3f} (Target: {ANISOTROPY_RATIO_TRUE})")
print(f"  ---")
print(f"  * Fitted Œ∏‚ÇÅ (œÉ¬≤/a): {phi1_adam:.3f} (Target: {PHI1_TARGET:.3f})")
print(f"  * Fitted Œ∏‚ÇÇ (1/a): {phi2_adam:.3f} (Target: {PHI2_TARGET:.3f})")
print(f"  * Fitted œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤): {phi3_adam:.3f} (Target: {PHI3_TARGET:.3f})")
print(f"  ---")
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")
print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*75)

--- Starting Data Generation (Anisotropic) ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) - STABLE ---
LBFGS Step 5/50, NLL: 1552.12, Params: [œÉ¬≤: 18.863, a: 0.883, Œ∏‚ÇÉ-ratio: 2.336], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0000, Œ¶3_raw: -0.0001]
LBFGS Step 10/50, NLL: 1552.12, Params: [œÉ¬≤: 18.863, a: 0.883, Œ∏‚ÇÉ-ratio: 2.336], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0000, Œ¶3_raw: -0.0001]
LBFGS Step 15/50, NLL: 1552.12, Params: [œÉ¬≤: 18.863, a: 0.883, Œ∏‚ÇÉ-ratio: 2.336], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0000, Œ¶3_raw: -0.0001]
LBFGS Step 20/50, NLL: 1552.12, Params: [œÉ¬≤: 18.863, a: 0.883, Œ∏‚ÇÉ-ratio: 2.336], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0000, Œ¶3_raw: -0.0001]
LBFGS Step 25/50, NLL: 1552.12, Params: [œÉ¬≤: 18.863, a: 0.883, Œ∏‚ÇÉ-ratio: 2.336], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0.0000, Œ¶3_raw: -0.0001]
LBFGS Step 30/50, NLL: 1552.12, Params: [œÉ¬≤: 18.863, a: 0.883, Œ∏‚ÇÉ-ratio: 2.336], Grads: [Œ¶1_raw: -0.0001, Œ¶2_raw: 0

use log transformation instead of optimizing raw prams .pow()     torch.exp(raw_log_param) this guarantees positivity without epsilon hack and optimization more lobust

In [3]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 30.0      # TARGET Variance (theta_1 / theta_2)
RANGE_A_TRUE = 1.5      # TARGET Range (1 / theta_2)
ANISOTROPY_RATIO_TRUE = 2.0 # TARGET Anisotropy (theta_3)
PHI3_TARGET_SQ = ANISOTROPY_RATIO_TRUE**2 # Target = 4.0
NUGGET_TRUE = 3.0       # üí° TARGET Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.001 

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# --- COVARIANCE FUNCTIONS (Unchanged) ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- üí° MODIFIED NLL Function (Log-Reparameterization) ---
def neg_log_likelihood_torch_stable(raw_log_params, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch):
    """
    Calculates -LL for PyTorch (optimizing Log-Reparameterization).
    raw_log_params[0] = log(phi1) (for theta_1 = sigma2/a)
    raw_log_params[1] = log(phi2) (for theta_2 = 1/a)
    raw_log_params[2] = log(phi3) (for phi_3 = theta_3^2)
    raw_log_params[3] = log(nugget) üí° NEW
    """
    
    # 1. üí° Recover parameters using torch.exp()
    phi1   = torch.exp(raw_log_params[0]) # theta_1 = sigma2 / a
    phi2   = torch.exp(raw_log_params[1]) # theta_2 = 1 / a
    phi3   = torch.exp(raw_log_params[2]) # phi_3 = theta_3^2
    nugget = torch.exp(raw_log_params[3]) # üí° NEW: Recover nugget
    
    # 2. Derive Original Parameters
    range_a = 1.0 / phi2          # Range: a = 1 / theta_2
    sigma2 = phi1 / phi2          # Variance: sigma2 = theta_1 / theta_2
    
    # 3. Compute Anisotropic Distance
    aniso_dist_sq = (d_lon_sq_torch / phi3) + d_lat_sq_torch
    
    aniso_dist = torch.sqrt(aniso_dist_sq + 1e-6)
    
    # 4. Calculate Covariance Matrix C
    # üí° Pass the optimized nugget, not a fixed one
    C = exponential_covariance_torch(aniso_dist, sigma2, range_a, nugget) 
    
    try:
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        
        if torch.isnan(neg_LL) or torch.isinf(neg_LL):
            return torch.tensor(1e15, device=C.device, dtype=torch.float) + raw_log_params.sum() * 0.0

        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device, dtype=torch.float) + raw_log_params.sum() * 0.0


# --- Data Generation Function (Unchanged) ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index, anisotropy_ratio):
    n_points = coords.shape[0]
    coords_transformed = coords.copy()
    coords_transformed[:, 1] = coords_transformed[:, 1] / anisotropy_ratio
    distances = cdist(coords_transformed, coords_transformed, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))
    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2
    data_np[:, 2] = coords[:, 0] * 40 + 250
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation (Anisotropic) ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET_TRUE, OZONE_MEAN, 21.0, # üí° Use NUGGET_TRUE
    ANISOTROPY_RATIO_TRUE
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))
print("--- Data Generation Complete ---")

# --- 2. Data Preparation (Unchanged) ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]]
z_centered_np = z_data - np.mean(z_data)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

lons = coordinates[:, 0:1]
lats = coordinates[:, 1:2]
d_lon_np = cdist(lons, lons, metric='euclidean')
d_lat_np = cdist(lats, lats, metric='euclidean')
d_lon_sq_np = np.square(d_lon_np)
d_lat_sq_np = np.square(d_lat_np)

d_lon_sq_torch = torch.tensor(d_lon_sq_np, dtype=torch.float)
d_lat_sq_torch = torch.tensor(d_lat_sq_np, dtype=torch.float)


# --- üí° MODIFIED Initial Parameter Setup (Log-space) ---
# Calculate target stable parameters
PHI1_TARGET = SIGMA2_TRUE / RANGE_A_TRUE    # theta_1 = 20.0
PHI2_TARGET = 1.0 / RANGE_A_TRUE            # theta_2 = 0.667
PHI3_TARGET = PHI3_TARGET_SQ                # phi_3 = 4.0

# Initial guess is now the log() of the target values
raw_log_phi1_start = np.log(PHI1_TARGET + 2.0) # Start off-target
raw_log_phi2_start = np.log(PHI2_TARGET + 1.0) # Start off-target
raw_log_phi3_start = np.log(PHI3_TARGET + 1.0) # Start off-target
raw_log_nugget_start = np.log(1.0)             # üí° NEW: Initial guess for nugget (Target is 3.0)

initial_params_stable = [
    raw_log_phi1_start, 
    raw_log_phi2_start,
    raw_log_phi3_start,
    raw_log_nugget_start  # üí° NEW
]


# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch) - LOG-STABLE
# ----------------------------------------------------

raw_params_lbfgs = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) - LOG-STABLE ---")

def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    # üí° Call NLL function with 4 params (nugget is now inside)
    loss = neg_log_likelihood_torch_stable(
        raw_params_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch
    )
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        # üí° Recover parameters with .exp()
        phi1 = raw_params_lbfgs[0].exp().item()
        phi2 = raw_params_lbfgs[1].exp().item()
        phi3 = raw_params_lbfgs[2].exp().item()
        nugget = raw_params_lbfgs[3].exp().item() # üí° NEW
        
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        current_theta_3_ratio = np.sqrt(phi3) 
        
        grad_phi1 = raw_params_lbfgs.grad[0].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi2 = raw_params_lbfgs.grad[1].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi3 = raw_params_lbfgs.grad[2].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_nugget = raw_params_lbfgs.grad[3].item() if raw_params_lbfgs.grad is not None else 0.0 # üí° NEW
        
        # üí° Updated print statement
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, "
              f"Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}, Œ∏‚ÇÉ-ratio: {current_theta_3_ratio:.3f}, Œ∑¬≤: {nugget:.3f}], "
              f"Grads: [logŒ¶1: {grad_phi1:.4f}, logŒ¶2: {grad_phi2:.4f}, logŒ¶3: {grad_phi3:.4f}, logŒó¬≤: {grad_nugget:.4f}]")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch) - LOG-STABLE
# ----------------------------------------------------

raw_params_adam = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) - LOG-STABLE ---")

for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    # üí° Call NLL function with 4 params
    loss = neg_log_likelihood_torch_stable(
        raw_params_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch
    )
    
    if torch.isinf(loss) or torch.isnan(loss):
        if (epoch + 1) % 50 == 0: 
            print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, Invalid loss. Skipping step.")
        continue

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        # üí° Recover parameters with .exp()
        phi1 = raw_params_adam[0].exp().item()
        phi2 = raw_params_adam[1].exp().item()
        phi3 = raw_params_adam[2].exp().item()
        nugget = raw_params_adam[3].exp().item() # üí° NEW
        
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        current_theta_3_ratio = np.sqrt(phi3)
        
        grad_phi1 = raw_params_adam.grad[0].item() if raw_params_adam.grad is not None else 0.0
        grad_phi2 = raw_params_adam.grad[1].item() if raw_params_adam.grad is not None else 0.0
        grad_phi3 = raw_params_adam.grad[2].item() if raw_params_adam.grad is not None else 0.0
        grad_nugget = raw_params_adam.grad[3].item() if raw_params_adam.grad is not None else 0.0 # üí° NEW
        
        # üí° Updated print statement
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, "
              f"Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}, Œ∏‚ÇÉ-ratio: {current_theta_3_ratio:.3f}, Œ∑¬≤: {nugget:.3f}], "
              f"Grads: [logŒ¶1: {grad_phi1:.4f}, logŒ¶2: {grad_phi2:.4f}, logŒ¶3: {grad_phi3:.4f}, logŒó¬≤: {grad_nugget:.4f}]")

# ----------------------------------------------------
# 3. üí° MODIFIED Display Results
# ----------------------------------------------------
print("\n" + "="*75)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, "
      f"Anisotropy (Œ∏‚ÇÉ-ratio)={ANISOTROPY_RATIO_TRUE}, Nugget (Œ∑¬≤)={NUGGET_TRUE}") # üí° Updated
print(f"                 (Derived Targets: Œ∏‚ÇÅ={PHI1_TARGET:.3f}, Œ∏‚ÇÇ={PHI2_TARGET:.3f}, œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤)={PHI3_TARGET:.3f})")
print("="*75)

# L-BFGS Results
# üí° Recover parameters with .exp()
phi1_lbfgs = raw_params_lbfgs[0].exp().detach().numpy().item()
phi2_lbfgs = raw_params_lbfgs[1].exp().detach().numpy().item()
phi3_lbfgs = raw_params_lbfgs[2].exp().detach().numpy().item()
fitted_nugget_lbfgs = raw_params_lbfgs[3].exp().detach().numpy().item() # üí° NEW

fitted_sigma2_lbfgs = phi1_lbfgs / (phi2_lbfgs + 1e-6)
fitted_range_a_lbfgs = 1.0 / (phi2_lbfgs + 1e-6)
fitted_ratio_lbfgs = np.sqrt(phi3_lbfgs)

print("‚ú® PyTorch L-BFGS Results (Log-Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Anisotropy (Œ∏‚ÇÉ-ratio): {fitted_ratio_lbfgs:.3f} (Target: {ANISOTROPY_RATIO_TRUE})")
print(f"  * Fitted Nugget (Œ∑¬≤): {fitted_nugget_lbfgs:.3f} (Target: {NUGGET_TRUE})") # üí° NEW
print(f"  ---")
print(f"  * Fitted Œ∏‚ÇÅ (œÉ¬≤/a): {phi1_lbfgs:.3f} (Target: {PHI1_TARGET:.3f})")
print(f"  * Fitted Œ∏‚ÇÇ (1/a): {phi2_lbfgs:.3f} (Target: {PHI2_TARGET:.3f})")
print(f"  * Fitted œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤): {phi3_lbfgs:.3f} (Target: {PHI3_TARGET:.3f})")
print(f"  ---")
print(f"  * Final -LL Value: {final_loss_lbfgs.item():.2f}")

# --- üí° NEW: Sanity Check for L-BFGS ---
print(f"  * --- Sanity Check (œÉ¬≤_hat + 1) ---")
sigma2_perturbed_lbfgs = fitted_sigma2_lbfgs + 1.0
phi1_perturbed_lbfgs = sigma2_perturbed_lbfgs * phi2_lbfgs

# üí° Create the 4-element perturbed tensor
raw_params_lbfgs_perturbed = torch.tensor([
    np.log(phi1_perturbed_lbfgs),      # New log(phi1)
    raw_params_lbfgs[1].item(),        # Old log(phi2)
    raw_params_lbfgs[2].item(),        # Old log(phi3)
    raw_params_lbfgs[3].item()         # üí° Old log(nugget)
], dtype=torch.float)

with torch.no_grad(): # Ensure no gradients are computed
    # üí° Call NLL function without fixed nugget
    nll_perturbed_lbfgs = neg_log_likelihood_torch_stable(
        raw_params_lbfgs_perturbed, 
        d_lon_sq_torch, 
        d_lat_sq_torch, 
        z_centered_torch
    )
print(f"  * NLL @ (œÉ¬≤_hat + 1.0): {nll_perturbed_lbfgs.item():.2f} (Change: {nll_perturbed_lbfgs.item() - final_loss_lbfgs.item():.2f})")
# --- End Sanity Check ---

print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")


# Adam Results
# üí° Recover parameters with .exp()
phi1_adam = raw_params_adam[0].exp().detach().numpy().item()
phi2_adam = raw_params_adam[1].exp().detach().numpy().item()
phi3_adam = raw_params_adam[2].exp().detach().numpy().item()
fitted_nugget_adam = raw_params_adam[3].exp().detach().numpy().item() # üí° NEW

fitted_sigma2_adam = phi1_adam / (phi2_adam + 1e-6)
fitted_range_a_adam = 1.0 / (phi2_adam + 1e-6)
fitted_ratio_adam = np.sqrt(phi3_adam)

print("\nüöÄ PyTorch Adam Results (Log-Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Anisotropy (Œ∏‚ÇÉ-ratio): {fitted_ratio_adam:.3f} (Target: {ANISOTROPY_RATIO_TRUE})")
print(f"  * Fitted Nugget (Œ∑¬≤): {fitted_nugget_adam:.3f} (Target: {NUGGET_TRUE})") # üí° NEW
print(f"  ---")
print(f"  * Fitted Œ∏‚ÇÅ (œÉ¬≤/a): {phi1_adam:.3f} (Target: {PHI1_TARGET:.3f})")
print(f"  * Fitted Œ∏‚ÇÇ (1/a): {phi2_adam:.3f} (Target: {PHI2_TARGET:.3f})")
print(f"  * Fitted œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤): {phi3_adam:.3f} (Target: {PHI3_TARGET:.3f})")
print(f"  ---")
print(f"  * Final -LL Value: {final_loss_adam.item():.2f}")

# --- üí° NEW: Sanity Check for Adam ---
print(f"  * --- Sanity Check (œÉ¬≤_hat + 1) ---")
sigma2_perturbed_adam = fitted_sigma2_adam + 1.0
phi1_perturbed_adam = sigma2_perturbed_adam * phi2_adam

# üí° Create the 4-element perturbed tensor
raw_params_adam_perturbed = torch.tensor([
    np.log(phi1_perturbed_adam),     # New log(phi1)
    raw_params_adam[1].item(),       # Old log(phi2)
    raw_params_adam[2].item(),       # Old log(phi3)
    raw_params_adam[3].item()        # üí° Old log(nugget)
], dtype=torch.float)

with torch.no_grad():
    # üí° Call NLL function without fixed nugget
    nll_perturbed_adam = neg_log_likelihood_torch_stable(
        raw_params_adam_perturbed, 
        d_lon_sq_torch, 
        d_lat_sq_torch, 
        z_centered_torch
    )
print(f"  * NLL @ (œÉ¬≤_hat + 1.0): {nll_perturbed_adam.item():.2f} (Change: {nll_perturbed_adam.item() - final_loss_adam.item():.2f})")
# --- End Sanity Check ---

print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*75)

--- Starting Data Generation (Anisotropic) ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) - LOG-STABLE ---
LBFGS Step 5/50, NLL: 1728.45, Params: [œÉ¬≤: 27.037, a: 1.182, Œ∏‚ÇÉ-ratio: 1.830, Œ∑¬≤: 3.075], Grads: [logŒ¶1: 0.0002, logŒ¶2: -0.0001, logŒ¶3: 0.0000, logŒó¬≤: 0.0003]
LBFGS Step 10/50, NLL: 1728.45, Params: [œÉ¬≤: 27.037, a: 1.182, Œ∏‚ÇÉ-ratio: 1.830, Œ∑¬≤: 3.075], Grads: [logŒ¶1: 0.0002, logŒ¶2: -0.0001, logŒ¶3: 0.0000, logŒó¬≤: 0.0003]
LBFGS Step 15/50, NLL: 1728.45, Params: [œÉ¬≤: 27.037, a: 1.182, Œ∏‚ÇÉ-ratio: 1.830, Œ∑¬≤: 3.075], Grads: [logŒ¶1: 0.0002, logŒ¶2: -0.0001, logŒ¶3: 0.0000, logŒó¬≤: 0.0003]
LBFGS Step 20/50, NLL: 1728.45, Params: [œÉ¬≤: 27.037, a: 1.182, Œ∏‚ÇÉ-ratio: 1.830, Œ∑¬≤: 3.075], Grads: [logŒ¶1: 0.0002, logŒ¶2: -0.0001, logŒ¶3: 0.0000, logŒó¬≤: 0.0003]
LBFGS Step 25/50, NLL: 1728.45, Params: [œÉ¬≤: 27.037, a: 1.182, Œ∏‚ÇÉ-ratio: 1.830, Œ∑¬≤: 3.075], Grads: [logŒ¶1: 0.0002, logŒ¶2: -0.0001, logŒ¶3: 0.0000, logŒó

In [4]:
import torch
import numpy as np
import torch.optim as optim
from scipy.spatial.distance import cdist

# --- 0. Global Parameters and Utility Functions ---
N_SPATIAL_POINTS = 1120
N_DAYS = 31
N_HOURS_PER_DAY = 8
N_FEATURES = 4
GRID_X = 40  
GRID_Y = 28  
LAT_MIN, LAT_MAX = 0, 5
LON_MIN, LON_MAX = 113, 123
BASE_DATE = '2024_07_y24m07day' 

# Exponential Kernel Parameters (Targets)
SIGMA2_TRUE = 30.0      # TARGET Variance (theta_1 / theta_2)
RANGE_A_TRUE = 1.5      # TARGET Range (1 / theta_2)
ANISOTROPY_RATIO_TRUE = 2.0 # TARGET Anisotropy (theta_3)
PHI3_TARGET_SQ = ANISOTROPY_RATIO_TRUE**2 # Target = 4.0
NUGGET_TRUE = 3.0       # üí° TARGET Nugget

# Optimization Setup
ADAM_ITERATIONS = 500
ADAM_LEARNING_RATE = 0.001 

# L-BFGS Setup
LBFGS_MAX_STEPS = 50 
LBFGS_MAX_EVAL = 50 

OZONE_MEAN = 240.0

# --- COVARIANCE FUNCTIONS (Unchanged) ---

def exponential_covariance_numpy(distances, sigma2, a, nugget):
    """Exponential covariance function (NumPy for Generation)."""
    cov = sigma2 * np.exp(-distances / a)
    if distances.shape[0] == distances.shape[1]:
        cov[np.diag_indices_from(distances)] += (nugget + 1e-6)
    return cov

def exponential_covariance_torch(distances_torch, sigma2, a, nugget):
    """Exponential covariance function (PyTorch for Optimization)."""
    cov = sigma2 * torch.exp(-distances_torch / a)
    
    if distances_torch.shape[0] == distances_torch.shape[1]:
        jitter = 1e-6 
        diag_mask = torch.eye(cov.shape[0], device=cov.device)
        cov = cov + diag_mask * (nugget + jitter)
    return cov

# --- üí° MODIFIED NLL Function (Log-Reparameterization) ---
def neg_log_likelihood_torch_stable(raw_log_params, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch):
    """
    Calculates -LL for PyTorch (optimizing Log-Reparameterization).
    raw_log_params[0] = log(phi1) (for theta_1 = sigma2/a)
    raw_log_params[1] = log(phi2) (for theta_2 = 1/a)
    raw_log_params[2] = log(phi3) (for phi_3 = theta_3^2)
    raw_log_params[3] = log(nugget) üí° NEW
    """
    
    # 1. üí° Recover parameters using torch.exp()
    phi1   = torch.exp(raw_log_params[0]) # theta_1 = sigma2 / a
    phi2   = torch.exp(raw_log_params[1]) # theta_2 = 1 / a
    phi3   = torch.exp(raw_log_params[2]) # phi_3 = theta_3^2
    nugget = torch.exp(raw_log_params[3]) # üí° NEW: Recover nugget
    
    # 2. Derive Original Parameters
    range_a = 1.0 / phi2          # Range: a = 1 / theta_2
    sigma2 = phi1 / phi2          # Variance: sigma2 = theta_1 / theta_2
    
    # 3. Compute Anisotropic Distance
    aniso_dist_sq = (d_lon_sq_torch / phi3) + d_lat_sq_torch
    
    aniso_dist = torch.sqrt(aniso_dist_sq + 1e-6)
    
    # 4. Calculate Covariance Matrix C
    # üí° Pass the optimized nugget, not a fixed one
    C = exponential_covariance_torch(aniso_dist, sigma2, range_a, nugget) 
    
    try:
        L = torch.linalg.cholesky(C)
        log_det = 2.0 * torch.sum(torch.log(torch.diag(L)))
        alpha = torch.linalg.solve(C, z_centered_torch.unsqueeze(1))
        quad_term = z_centered_torch.unsqueeze(0) @ alpha
        neg_LL = 0.5 * log_det + 0.5 * quad_term.squeeze()
        
        if torch.isnan(neg_LL) or torch.isinf(neg_LL):
            return torch.tensor(1e15, device=C.device, dtype=torch.float) + raw_log_params.sum() * 0.0

        return neg_LL
    except RuntimeError:
        return torch.tensor(1e15, device=C.device, dtype=torch.float) + raw_log_params.sum() * 0.0


# --- üí° NEW: Sanity Check Helper Function ---
def check_nll_from_interpretable(sigma2, range_a, aniso_ratio, nugget, 
                                d_lon_sq_torch, d_lat_sq_torch, z_centered_torch):
    """
    Helper function to calculate NLL from interpretable parameters.
    Used for sanity checks.
    """
    # 1. Convert interpretable params to phi-space
    # Add small epsilon to prevent log(0) or divide by zero
    range_a = max(range_a, 1e-6)
    aniso_ratio = max(aniso_ratio, 1e-6)
    nugget = max(nugget, 1e-6)

    phi2 = 1.0 / range_a
    phi1 = sigma2 * phi2
    phi3 = aniso_ratio**2
    
    # 2. Create the raw_log_params tensor
    raw_log_params = torch.tensor([
        np.log(phi1), 
        np.log(phi2), 
        np.log(phi3), 
        np.log(nugget)
    ], dtype=torch.float)
    
    # 3. Calculate NLL
    with torch.no_grad():
        nll = neg_log_likelihood_torch_stable(
            raw_log_params, 
            d_lon_sq_torch, 
            d_lat_sq_torch, 
            z_centered_torch
        )
    return nll.item()


# --- Data Generation Function (Unchanged) ---
def generate_ozone_data_map(coords, sigma2, a, nugget, mean, time_index, anisotropy_ratio):
    n_points = coords.shape[0]
    coords_transformed = coords.copy()
    coords_transformed[:, 1] = coords_transformed[:, 1] / anisotropy_ratio
    distances = cdist(coords_transformed, coords_transformed, metric='euclidean')
    Cov = exponential_covariance_numpy(distances, sigma2, a, nugget) 
    Cov = (Cov + Cov.T) / 2
    try:
        L = np.linalg.cholesky(Cov)
    except np.linalg.LinAlgError:
        return np.zeros((n_points, N_FEATURES))
    W = np.random.normal(0, 1, size=(n_points, 1))
    Z_correlated = L @ W
    ozone_values = mean + Z_correlated
    data_np = np.zeros((n_points, N_FEATURES))
    data_np[:, 0:1] = ozone_values             
    data_np[:, 1] = coords[:, 1] * 10 + 2
    data_np[:, 2] = coords[:, 0] * 40 + 250
    data_np[:, 3] = time_index                 
    return data_np


# --- 1. Data Generation Execution ---
df_day_aggregated_list = []
print("--- Starting Data Generation (Anisotropic) ---")
lat_coords = np.linspace(LAT_MIN, LAT_MAX, GRID_Y)
lon_coords = np.linspace(LON_MIN, LON_MAX, GRID_X)
coords_latlon = np.array([[lat, lon] for lat in lat_coords for lon in lon_coords])

data_np = generate_ozone_data_map(
    coords_latlon, SIGMA2_TRUE, RANGE_A_TRUE, NUGGET_TRUE, OZONE_MEAN, 21.0, # üí° Use NUGGET_TRUE
    ANISOTROPY_RATIO_TRUE
)
df_day_aggregated_list.append(torch.tensor(data_np, dtype=torch.float))
print("--- Data Generation Complete ---")

# --- 2. Data Preparation (Unchanged) ---
data_to_fit = df_day_aggregated_list[0][:N_SPATIAL_POINTS, :] 
z_data = data_to_fit[:, 0].numpy()
coordinates = coords_latlon[:, [1, 0]]
z_centered_np = z_data - np.mean(z_data)
z_centered_torch = torch.tensor(z_centered_np, dtype=torch.float)

lons = coordinates[:, 0:1]
lats = coordinates[:, 1:2]
d_lon_np = cdist(lons, lons, metric='euclidean')
d_lat_np = cdist(lats, lats, metric='euclidean')
d_lon_sq_np = np.square(d_lon_np)
d_lat_sq_np = np.square(d_lat_np)

d_lon_sq_torch = torch.tensor(d_lon_sq_np, dtype=torch.float)
d_lat_sq_torch = torch.tensor(d_lat_sq_np, dtype=torch.float)


# --- üí° MODIFIED Initial Parameter Setup (Log-space) ---
# Calculate target stable parameters
PHI1_TARGET = SIGMA2_TRUE / RANGE_A_TRUE    # theta_1 = 20.0
PHI2_TARGET = 1.0 / RANGE_A_TRUE            # theta_2 = 0.667
PHI3_TARGET = PHI3_TARGET_SQ                # phi_3 = 4.0

# Initial guess is now the log() of the target values
raw_log_phi1_start = np.log(PHI1_TARGET + 2.0) # Start off-target
raw_log_phi2_start = np.log(PHI2_TARGET + 1.0) # Start off-target
raw_log_phi3_start = np.log(PHI3_TARGET + 1.0) # Start off-target
raw_log_nugget_start = np.log(1.0)             # üí° NEW: Initial guess for nugget (Target is 3.0)

initial_params_stable = [
    raw_log_phi1_start, 
    raw_log_phi2_start,
    raw_log_phi3_start,
    raw_log_nugget_start  # üí° NEW
]


# ----------------------------------------------------
# A. Optimization with L-BFGS (PyTorch) - LOG-STABLE
# ----------------------------------------------------

raw_params_lbfgs = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_lbfgs = optim.LBFGS(
    [raw_params_lbfgs], 
    lr=1.0, 
    max_iter=LBFGS_MAX_STEPS,
    max_eval=LBFGS_MAX_EVAL 
)

final_loss_lbfgs = torch.tensor(0.0)
print("\n--- A. Starting MLE Optimization (PyTorch L-BFGS) - LOG-STABLE ---")

def closure_lbfgs():
    optimizer_lbfgs.zero_grad()
    # üí° Call NLL function with 4 params (nugget is now inside)
    loss = neg_log_likelihood_torch_stable(
        raw_params_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch
    )
    if not torch.isinf(loss) and not torch.isnan(loss):
        loss.backward()
    return loss

for step in range(LBFGS_MAX_STEPS):
    loss = optimizer_lbfgs.step(closure_lbfgs)
    final_loss_lbfgs = loss
    
    if (step + 1) % 5 == 0: 
        # üí° Recover parameters with .exp()
        phi1 = raw_params_lbfgs[0].exp().item()
        phi2 = raw_params_lbfgs[1].exp().item()
        phi3 = raw_params_lbfgs[2].exp().item()
        nugget = raw_params_lbfgs[3].exp().item() # üí° NEW
        
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        current_theta_3_ratio = np.sqrt(phi3) 
        
        grad_phi1 = raw_params_lbfgs.grad[0].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi2 = raw_params_lbfgs.grad[1].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_phi3 = raw_params_lbfgs.grad[2].item() if raw_params_lbfgs.grad is not None else 0.0
        grad_nugget = raw_params_lbfgs.grad[3].item() if raw_params_lbfgs.grad is not None else 0.0 # üí° NEW
        
        # üí° Updated print statement
        print(f"LBFGS Step {step + 1}/{LBFGS_MAX_STEPS}, NLL: {loss.item():.2f}, "
              f"Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}, Œ∏‚ÇÉ-ratio: {current_theta_3_ratio:.3f}, Œ∑¬≤: {nugget:.3f}], "
              f"Grads: [logŒ¶1: {grad_phi1:.4f}, logŒ¶2: {grad_phi2:.4f}, logŒ¶3: {grad_phi3:.4f}, logŒó¬≤: {grad_nugget:.4f}]")

# ----------------------------------------------------
# B. Optimization with Adam (PyTorch) - LOG-STABLE
# ----------------------------------------------------

raw_params_adam = torch.tensor(
    initial_params_stable, 
    dtype=torch.float, 
    requires_grad=True
)

optimizer_adam = optim.Adam(
    [raw_params_adam], 
    lr=ADAM_LEARNING_RATE
)

final_loss_adam = torch.tensor(0.0)
print(f"\n--- B. Starting MLE Optimization (PyTorch Adam) - LOG-STABLE ---")

for epoch in range(ADAM_ITERATIONS):
    optimizer_adam.zero_grad()
    
    # üí° Call NLL function with 4 params
    loss = neg_log_likelihood_torch_stable(
        raw_params_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch
    )
    
    if torch.isinf(loss) or torch.isnan(loss):
        if (epoch + 1) % 50 == 0: 
            print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, Invalid loss. Skipping step.")
        continue

    loss.backward()
    optimizer_adam.step()
    final_loss_adam = loss
    
    if (epoch + 1) % 50 == 0: 
        # üí° Recover parameters with .exp()
        phi1 = raw_params_adam[0].exp().item()
        phi2 = raw_params_adam[1].exp().item()
        phi3 = raw_params_adam[2].exp().item()
        nugget = raw_params_adam[3].exp().item() # üí° NEW
        
        current_sigma2 = phi1 / (phi2 + 1e-6)
        current_a = 1.0 / (phi2 + 1e-6)
        current_theta_3_ratio = np.sqrt(phi3)
        
        grad_phi1 = raw_params_adam.grad[0].item() if raw_params_adam.grad is not None else 0.0
        grad_phi2 = raw_params_adam.grad[1].item() if raw_params_adam.grad is not None else 0.0
        grad_phi3 = raw_params_adam.grad[2].item() if raw_params_adam.grad is not None else 0.0
        grad_nugget = raw_params_adam.grad[3].item() if raw_params_adam.grad is not None else 0.0 # üí° NEW
        
        # üí° Updated print statement
        print(f"Adam Epoch {epoch + 1}/{ADAM_ITERATIONS}, NLL: {loss.item():.2f}, "
              f"Params: [œÉ¬≤: {current_sigma2:.3f}, a: {current_a:.3f}, Œ∏‚ÇÉ-ratio: {current_theta_3_ratio:.3f}, Œ∑¬≤: {nugget:.3f}], "
              f"Grads: [logŒ¶1: {grad_phi1:.4f}, logŒ¶2: {grad_phi2:.4f}, logŒ¶3: {grad_phi3:.4f}, logŒó¬≤: {grad_nugget:.4f}]")

# ----------------------------------------------------
# 3. üí° MODIFIED Display Results
# ----------------------------------------------------
print("\n" + "="*75)
print(f"TARGET PARAMETERS: Variance (œÉ¬≤)={SIGMA2_TRUE}, Range (a)={RANGE_A_TRUE}, "
      f"Anisotropy (Œ∏‚ÇÉ-ratio)={ANISOTROPY_RATIO_TRUE}, Nugget (Œ∑¬≤)={NUGGET_TRUE}") # üí° Updated
print(f"                 (Derived Targets: Œ∏‚ÇÅ={PHI1_TARGET:.3f}, Œ∏‚ÇÇ={PHI2_TARGET:.3f}, œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤)={PHI3_TARGET:.3f})")
print("="*75)

# --- L-BFGS Results ---
phi1_lbfgs = raw_params_lbfgs[0].exp().detach().numpy().item()
phi2_lbfgs = raw_params_lbfgs[1].exp().detach().numpy().item()
phi3_lbfgs = raw_params_lbfgs[2].exp().detach().numpy().item()
fitted_nugget_lbfgs = raw_params_lbfgs[3].exp().detach().numpy().item() 

fitted_sigma2_lbfgs = phi1_lbfgs / (phi2_lbfgs + 1e-6)
fitted_range_a_lbfgs = 1.0 / (phi2_lbfgs + 1e-6)
fitted_ratio_lbfgs = np.sqrt(phi3_lbfgs)
nll_final_lbfgs = final_loss_lbfgs.item()

print("‚ú® PyTorch L-BFGS Results (Log-Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_lbfgs:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_lbfgs:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Anisotropy (Œ∏‚ÇÉ-ratio): {fitted_ratio_lbfgs:.3f} (Target: {ANISOTROPY_RATIO_TRUE})")
print(f"  * Fitted Nugget (Œ∑¬≤): {fitted_nugget_lbfgs:.3f} (Target: {NUGGET_TRUE})") 
print(f"  ---")
print(f"  * Fitted Œ∏‚ÇÅ (œÉ¬≤/a): {phi1_lbfgs:.3f} (Target: {PHI1_TARGET:.3f})")
print(f"  * Fitted Œ∏‚ÇÇ (1/a): {phi2_lbfgs:.3f} (Target: {PHI2_TARGET:.3f})")
print(f"  * Fitted œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤): {phi3_lbfgs:.3f} (Target: {PHI3_TARGET:.3f})")
print(f"  ---")
print(f"  * Final -LL Value: {nll_final_lbfgs:.2f}")

# --- üí° NEW: Expanded Sanity Check for L-BFGS ---
print(f"  * --- Sanity Check (L-BFGS) ---")
# Check œÉ¬≤
s2_plus = check_nll_from_interpretable(fitted_sigma2_lbfgs + 1.0, fitted_range_a_lbfgs, fitted_ratio_lbfgs, fitted_nugget_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
s2_minus = check_nll_from_interpretable(fitted_sigma2_lbfgs - 1.0, fitted_range_a_lbfgs, fitted_ratio_lbfgs, fitted_nugget_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ œÉ¬≤_hat + 1.0: {s2_plus:.2f} (Change: {s2_plus - nll_final_lbfgs:.2f})")
print(f"  * NLL @ œÉ¬≤_hat - 1.0: {s2_minus:.2f} (Change: {s2_minus - nll_final_lbfgs:.2f})")
# Check Range
a_plus = check_nll_from_interpretable(fitted_sigma2_lbfgs, fitted_range_a_lbfgs + 0.1, fitted_ratio_lbfgs, fitted_nugget_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
a_minus = check_nll_from_interpretable(fitted_sigma2_lbfgs, fitted_range_a_lbfgs - 0.1, fitted_ratio_lbfgs, fitted_nugget_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ a_hat + 0.1: {a_plus:.2f} (Change: {a_plus - nll_final_lbfgs:.2f})")
print(f"  * NLL @ a_hat - 0.1: {a_minus:.2f} (Change: {a_minus - nll_final_lbfgs:.2f})")
# Check Anisotropy
r_plus = check_nll_from_interpretable(fitted_sigma2_lbfgs, fitted_range_a_lbfgs, fitted_ratio_lbfgs + 0.1, fitted_nugget_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
r_minus = check_nll_from_interpretable(fitted_sigma2_lbfgs, fitted_range_a_lbfgs, fitted_ratio_lbfgs - 0.1, fitted_nugget_lbfgs, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ Œ∏‚ÇÉ_hat + 0.1: {r_plus:.2f} (Change: {r_plus - nll_final_lbfgs:.2f})")
print(f"  * NLL @ Œ∏‚ÇÉ_hat - 0.1: {r_minus:.2f} (Change: {r_minus - nll_final_lbfgs:.2f})")
# Check Nugget
n_plus = check_nll_from_interpretable(fitted_sigma2_lbfgs, fitted_range_a_lbfgs, fitted_ratio_lbfgs, fitted_nugget_lbfgs + 0.1, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
n_minus = check_nll_from_interpretable(fitted_sigma2_lbfgs, fitted_range_a_lbfgs, fitted_ratio_lbfgs, fitted_nugget_lbfgs - 0.1, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ Œ∑¬≤_hat + 0.1: {n_plus:.2f} (Change: {n_plus - nll_final_lbfgs:.2f})")
print(f"  * NLL @ Œ∑¬≤_hat - 0.1: {n_minus:.2f} (Change: {n_minus - nll_final_lbfgs:.2f})")

print(f"  * Optimization Steps: {LBFGS_MAX_STEPS} steps")


# --- Adam Results ---
phi1_adam = raw_params_adam[0].exp().detach().numpy().item()
phi2_adam = raw_params_adam[1].exp().detach().numpy().item()
phi3_adam = raw_params_adam[2].exp().detach().numpy().item()
fitted_nugget_adam = raw_params_adam[3].exp().detach().numpy().item() 

fitted_sigma2_adam = phi1_adam / (phi2_adam + 1e-6)
fitted_range_a_adam = 1.0 / (phi2_adam + 1e-6)
fitted_ratio_adam = np.sqrt(phi3_adam)
nll_final_adam = final_loss_adam.item()

print("\nüöÄ PyTorch Adam Results (Log-Stable Reparameterization):")
print(f"  * Fitted Variance (œÉ¬≤): {fitted_sigma2_adam:.3f} (Target: {SIGMA2_TRUE})")
print(f"  * Fitted Range (a): {fitted_range_a_adam:.3f} (Target: {RANGE_A_TRUE})")
print(f"  * Fitted Anisotropy (Œ∏‚ÇÉ-ratio): {fitted_ratio_adam:.3f} (Target: {ANISOTROPY_RATIO_TRUE})")
print(f"  * Fitted Nugget (Œ∑¬≤): {fitted_nugget_adam:.3f} (Target: {NUGGET_TRUE})") 
print(f"  ---")
print(f"  * Fitted Œ∏‚ÇÅ (œÉ¬≤/a): {phi1_adam:.3f} (Target: {PHI1_TARGET:.3f})")
print(f"  * Fitted Œ∏‚ÇÇ (1/a): {phi2_adam:.3f} (Target: {PHI2_TARGET:.3f})")
print(f"  * Fitted œÜ‚ÇÉ (Œ∏‚ÇÉ¬≤): {phi3_adam:.3f} (Target: {PHI3_TARGET:.3f})")
print(f"  ---")
print(f"  * Final -LL Value: {nll_final_adam:.2f}")

# --- üí° NEW: Expanded Sanity Check for Adam ---
print(f"  * --- Sanity Check (Adam) ---")
# Check œÉ¬≤
s2_plus = check_nll_from_interpretable(fitted_sigma2_adam + 1.0, fitted_range_a_adam, fitted_ratio_adam, fitted_nugget_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
s2_minus = check_nll_from_interpretable(fitted_sigma2_adam - 1.0, fitted_range_a_adam, fitted_ratio_adam, fitted_nugget_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ œÉ¬≤_hat + 1.0: {s2_plus:.2f} (Change: {s2_plus - nll_final_adam:.2f})")
print(f"  * NLL @ œÉ¬≤_hat - 1.0: {s2_minus:.2f} (Change: {s2_minus - nll_final_adam:.2f})")
# Check Range
a_plus = check_nll_from_interpretable(fitted_sigma2_adam, fitted_range_a_adam + 0.1, fitted_ratio_adam, fitted_nugget_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
a_minus = check_nll_from_interpretable(fitted_sigma2_adam, fitted_range_a_adam - 0.1, fitted_ratio_adam, fitted_nugget_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ a_hat + 0.1: {a_plus:.2f} (Change: {a_plus - nll_final_adam:.2f})")
print(f"  * NLL @ a_hat - 0.1: {a_minus:.2f} (Change: {a_minus - nll_final_adam:.2f})")
# Check Anisotropy
r_plus = check_nll_from_interpretable(fitted_sigma2_adam, fitted_range_a_adam, fitted_ratio_adam + 0.1, fitted_nugget_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
r_minus = check_nll_from_interpretable(fitted_sigma2_adam, fitted_range_a_adam, fitted_ratio_adam - 0.1, fitted_nugget_adam, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ Œ∏‚ÇÉ_hat + 0.1: {r_plus:.2f} (Change: {r_plus - nll_final_adam:.2f})")
print(f"  * NLL @ Œ∏‚ÇÉ_hat - 0.1: {r_minus:.2f} (Change: {r_minus - nll_final_adam:.2f})")
# Check Nugget
n_plus = check_nll_from_interpretable(fitted_sigma2_adam, fitted_range_a_adam, fitted_ratio_adam, fitted_nugget_adam + 0.1, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
n_minus = check_nll_from_interpretable(fitted_sigma2_adam, fitted_range_a_adam, fitted_ratio_adam, fitted_nugget_adam - 0.1, d_lon_sq_torch, d_lat_sq_torch, z_centered_torch)
print(f"  * NLL @ Œ∑¬≤_hat + 0.1: {n_plus:.2f} (Change: {n_plus - nll_final_adam:.2f})")
print(f"  * NLL @ Œ∑¬≤_hat - 0.1: {n_minus:.2f} (Change: {n_minus - nll_final_adam:.2f})")

print(f"  * Optimization Steps: {ADAM_ITERATIONS} epochs")
print("="*75)

--- Starting Data Generation (Anisotropic) ---
--- Data Generation Complete ---

--- A. Starting MLE Optimization (PyTorch L-BFGS) - LOG-STABLE ---
LBFGS Step 5/50, NLL: 1712.44, Params: [œÉ¬≤: 37.698, a: 1.468, Œ∏‚ÇÉ-ratio: 2.065, Œ∑¬≤: 2.677], Grads: [logŒ¶1: -0.0004, logŒ¶2: -0.0000, logŒ¶3: 0.0000, logŒó¬≤: -0.0002]
LBFGS Step 10/50, NLL: 1712.44, Params: [œÉ¬≤: 37.698, a: 1.468, Œ∏‚ÇÉ-ratio: 2.065, Œ∑¬≤: 2.677], Grads: [logŒ¶1: -0.0004, logŒ¶2: -0.0000, logŒ¶3: 0.0000, logŒó¬≤: -0.0002]
LBFGS Step 15/50, NLL: 1712.44, Params: [œÉ¬≤: 37.698, a: 1.468, Œ∏‚ÇÉ-ratio: 2.065, Œ∑¬≤: 2.677], Grads: [logŒ¶1: -0.0004, logŒ¶2: -0.0000, logŒ¶3: 0.0000, logŒó¬≤: -0.0002]
LBFGS Step 20/50, NLL: 1712.44, Params: [œÉ¬≤: 37.698, a: 1.468, Œ∏‚ÇÉ-ratio: 2.065, Œ∑¬≤: 2.677], Grads: [logŒ¶1: -0.0004, logŒ¶2: -0.0000, logŒ¶3: 0.0000, logŒó¬≤: -0.0002]
LBFGS Step 25/50, NLL: 1712.44, Params: [œÉ¬≤: 37.698, a: 1.468, Œ∏‚ÇÉ-ratio: 2.065, Œ∑¬≤: 2.677], Grads: [logŒ¶1: -0.0004, logŒ¶2: -0.0000, logŒ¶3: 0.00