In [2]:
# Standard libraries
import sys
# Add your custom path
gems_tco_path = "/Users/joonwonlee/Documents/GEMS_TCO-1/src"
sys.path.append(gems_tco_path)
import logging
import argparse # Argument parsing

# Data manipulation and analysis
import pandas as pd
import numpy as np
import pickle
import torch
import torch.optim as optim
import copy                    # clone tensor
import time

# Custom imports
import GEMS_TCO
from GEMS_TCO import kernels

from GEMS_TCO import kernels 
from GEMS_TCO import orderings as _orderings 
from GEMS_TCO import load_data
from GEMS_TCO import alg_optimization, alg_opt_Encoder
from GEMS_TCO import configuration as config

from typing import Optional, List, Tuple
from pathlib import Path
import typer
import json
from json import JSONEncoder

from GEMS_TCO import configuration as config
from GEMS_TCO import data_preprocess as dmbh

import os
from sklearn.neighbors import BallTree

## Load data (this version keeps the latitude calibration)

In [2]:
# initialization
# Run configuration is kept as comma-separated strings and parsed into ints below.
space = ['1,1']  # lat, lon resolution
days = ['0,31']  # "start,end" day indices; used as a half-open range below
mm_cond_number = 20  # forwarded to the loader as `mm_cond_number`
lat_lon_resolution = [int(s) for s in space[0].split(',')]
days_s_e = list(map(int, days[0].split(',')))
print(days_s_e)

days_list = list(range(days_s_e[0], days_s_e[1]))
years = ['2024']
month_range =[7,8]
# Input and output share one directory taken from the project configuration.
output_path = input_path = Path(config.mac_estimates_day_path)

## load ozone data from amarel
# NOTE(review): `load_data` is brought in with `from GEMS_TCO import load_data` and is
# called directly here; confirm it is a callable/class and not a submodule.
data_load_instance = load_data(config.mac_data_load_path)
df_map, ord_mm, nns_map= data_load_instance.load_mm20k_data_bymonthyear(lat_lon_resolution = lat_lon_resolution, mm_cond_number=mm_cond_number,years_=years, months_=month_range)  

# only fit spline once because space are all same
# load first data of analysis_data_map and aggregated_data to initialize spline_instance
# presumably [0, 8] selects the first eight entries (one day) of the data map -- TODO confirm
first_day_idx_for_datamap= [0,8]
first_day_analysis_data_map, first_day_aggregated_data = data_load_instance.load_working_data_byday(df_map, ord_mm, nns_map, idx_for_datamap= first_day_idx_for_datamap)

[0, 31]


## Load data and undo latitude calibration

In [4]:
#keys = list(df_map.keys())
#df = df_map[keys[0]]

# Longitude bounds and grid spacing (degrees) used to build the regular target grid.
lon_s = 123
lon_e = 133
step_lat = 0.044
step_lon = 0.063

# Descending coordinate vectors; the 0.0002 offset nudges the first grid value
# just inside the upper bound. The latitude bounds (5 and 0) are literal here.
lat_coords = np.arange( 5 -0.044- 0.0002, 0 -0.044, -0.044)
lon_coords = np.arange( lon_e-step_lon- 0.0002, lon_s-step_lon, -step_lon)

# Apply the shift as in the original code
# These are the unique lat/lon values for the "center_points" grid
final_lat_values = lat_coords + step_lat 
final_lon_values = lon_coords + step_lon 

# Create 2D grid with broadcasting
#decrement = 0.00012
# With decrement = 0 every column of lat_grid repeats final_lat_values unchanged,
# i.e. no per-column latitude adjustment is applied.
decrement = 0 
lat_grid = final_lat_values[:, None] + np.arange(len(final_lon_values)) * decrement  # shape: (len(final_lat_values), len(final_lon_values))


mac_data_path = config.mac_data_load_path
# NOTE: `years` is rebound here to a list of ints (an earlier cell used strings).
years = [2024]  # years = [2023,2024]
months = list( range(7,8))
year = years[0]
month = months[0]
month_str = f"{month:02d}"  
# NOTE(review): the folder name hard-codes "pickle_2024" while the file name uses `year`.
filename = f"pickle_2024/orbit_map{str(year)[2:]}_{month_str}.pkl"
picklefile_path = Path(mac_data_path) / filename
print(picklefile_path)

# pickle.load executes arbitrary code from the file; only load trusted, locally produced pickles.
with open(picklefile_path, 'rb') as pickle_file:
    data_map_hour = pickle.load(pickle_file)

# Base file path and settings
# base_path = "C:\\Users\\joonw\\TCO\\GEMS_data"    MSI notebook

# The assignments below repeat values already set above in this cell.
mac_data_path = config.mac_data_load_path
lat_start, lat_end, lon_start, lon_end = 0, 5, 123, 133
step_lat, step_lon = 0.044, 0.063

# df = pd.read_csv("C:\\Users\\joonw\\TCO\\GEMS_data\\data_2024\\data_24_07_0131_N510_E110120.csv")  MSI notebook
# NOTE(review): absolute, machine-specific path; consider deriving it from the config module.
df = pd.read_csv("/Users/joonwonlee/Documents/GEMS_DATA/data_2024/data_24_07_0131_N05_E123133.csv")  # MAC


/Users/joonwonlee/Documents/GEMS_DATA/pickle_2024/orbit_map24_07.pkl


In [5]:
class center_matching_hour():
    """
    Re-grids orbit data onto a regular lat/lon grid by nearest-neighbour matching.

    For every grid ("center") point the closest source observation of an orbit
    (haversine distance) is looked up and its ozone value is copied onto the
    grid point. No averaging is performed.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing the data. May be None when
            only `make_center_points` / `coarse_by_center` are used.
        lat_s (float): Lower latitude bound of the grid.
        lat_e (float): Upper latitude bound of the grid.
        lon_s (float): Lower longitude bound of the grid.
        lon_e (float): Upper longitude bound of the grid.
        lat_resolution (Optional[float]): Stored on the instance; not read by
            any method of this class. Default is None.
        lon_resolution (Optional[float]): Stored on the instance; not read by
            any method of this class. Default is None.
    """
    def __init__(
        self, 
        df:pd.DataFrame=None, 
        lat_s:float =0,
        lat_e:float =5, 
        lon_s:float =123,
        lon_e:float =133, 
        lat_resolution:float=None, 
        lon_resolution:float =None
    ):
        # Input validation
        if df is not None:
            assert isinstance(df, pd.DataFrame), "df must be a pandas DataFrame"

        # Integers are valid resolutions too (e.g. lat_resolution=1); the
        # previous float-only check rejected them.
        if lat_resolution is not None:
            assert isinstance(lat_resolution, (int, float)), "lat_resolution must be a number"
        if lon_resolution is not None:
            assert isinstance(lon_resolution, (int, float)), "lon_resolution must be a number"
        
        self.df = df
        self.lat_resolution = lat_resolution
        self.lon_resolution = lon_resolution
        self.lat_s = lat_s
        self.lat_e = lat_e
        self.lon_s = lon_s
        self.lon_e = lon_e

    def group_data_by_orbits(self):
        """
        Groups data into a dictionary based on unique orbit timestamps.

        The orbit identifier is the first 16 characters of the 'Time' string
        (minute precision). Each returned frame carries an extra 'Orbit' column;
        the DataFrame stored on the instance is left untouched. Rows whose
        'Time' is missing are skipped.

        Returns:
            dict: A dictionary where keys represent formatted orbit identifiers 
                and values are DataFrames corresponding to each orbit, in order
                of first appearance.

        Raises:
            ValueError: If the instance was created without a DataFrame.
        """
        if self.df is None:
            raise ValueError("group_data_by_orbits requires a DataFrame, but df is None")

        # Work on a tagged copy so the caller's frame does not silently gain a column.
        tagged = self.df.assign(Orbit=self.df['Time'].str[0:16])

        orbit_map = {}
        # A single groupby pass replaces one full-frame boolean mask per orbit;
        # sort=False keeps the order of first appearance.
        for orbit, rows in tagged.groupby('Orbit', sort=False):
            orbit_key = f'y{orbit[2:4]}m{int(orbit[5:7]):02d}day{ int(orbit[8:10]):02d}_hm{(orbit[11:16])}'
            orbit_map[orbit_key] = rows.reset_index(drop=True)
        return orbit_map
    
    def make_center_points(self, step_lat:float=0.042, step_lon:float=0.062) -> pd.DataFrame:
        """
        Builds the regular target grid as a DataFrame with columns 'lat' and 'lon'.

        Coordinates run in descending order starting just below the upper bounds
        (offset 0.0002). Rows are ordered with latitude varying slowest and
        longitude varying fastest.

        Args:
            step_lat (float): Grid spacing in latitude.
            step_lon (float): Grid spacing in longitude.

        Returns:
            pd.DataFrame: One row per grid point.
        """
        lat_coords = np.arange( self.lat_e-step_lat- 0.0002, self.lat_s -step_lat, -step_lat)
        lon_coords = np.arange( self.lon_e-step_lon- 0.0002, self.lon_s-step_lon, -step_lon)

        # Apply the shift as in the original code
        # These are the unique lat/lon values for the "center_points" grid
        final_lat_values = lat_coords + step_lat 
        final_lon_values = lon_coords + step_lon 

        # No per-column latitude decrement is applied, so every latitude is
        # simply repeated once per longitude (row-major / C order).
        center_lats = np.repeat(final_lat_values, len(final_lon_values))

        # Matching longitude vector: the full longitude row repeated per latitude.
        center_lons = np.tile(final_lon_values, len(final_lat_values))

        return pd.DataFrame({'lat': center_lats, 'lon': center_lons})
    

    def coarse_by_center(self, orbit_map: dict, center_points: pd.DataFrame) -> dict:
        """
        Matches every center point to its nearest source observation per orbit.

        Duplicates are allowed: the same source observation may be assigned to
        several center points.

        Args:
            orbit_map (dict): Orbit key -> DataFrame with columns 'Latitude',
                'Longitude', 'ColumnAmountO3', 'Hours_elapsed' and 'Time'.
            center_points (pd.DataFrame): Grid with columns 'lat' and 'lon'.

        Returns:
            dict: Orbit key -> DataFrame with one row per center point and
                columns 'Latitude', 'Longitude', 'ColumnAmountO3',
                'Hours_elapsed', 'Time', 'Source_Latitude', 'Source_Longitude'.
                Orbits without observations yield NaN/NaT-filled frames.
        """
        assert isinstance(orbit_map, dict), "orbit_map must be a dict"
        assert isinstance(center_points, pd.DataFrame), "center_points must be a pd.DataFrame"

        coarse_map = {}
        key_list = sorted(orbit_map)

        # Plain arrays are used for the grid so that every output frame gets a
        # fresh 0..n-1 index regardless of center_points' own index.
        center_lat = center_points['lat'].to_numpy()
        center_lon = center_points['lon'].to_numpy()
        query_points = center_points[['lat', 'lon']].to_numpy()
        query_points_rad = np.radians(query_points)  # haversine expects radians

        num_center_points = len(center_points)

        for key in key_list:
            cur_data = orbit_map[key].reset_index(drop=True)
            locs = cur_data[['Latitude', 'Longitude']].to_numpy()

            if locs.shape[0] == 0:
                # Nothing to match against: keep the grid, fill everything else with missing values.
                coarse_map[key] = pd.DataFrame({
                    'Latitude': center_lat,
                    'Longitude': center_lon,
                    'ColumnAmountO3': [np.nan] * num_center_points,
                    'Hours_elapsed': [np.nan] * num_center_points,
                    'Time': [pd.NaT] * num_center_points,
                    'Source_Latitude': [np.nan] * num_center_points,
                    'Source_Longitude': [np.nan] * num_center_points
                })
                continue

            # Nearest neighbour on the sphere (haversine metric on radians).
            locs_rad = np.radians(locs)
            tree = BallTree(locs_rad, metric='haversine')
            dist, ind = tree.query(query_points_rad, k=1)

            nearest_indices = ind.flatten()

            # Extract values from the nearest source points
            res_o3_values = cur_data.loc[nearest_indices, 'ColumnAmountO3'].values
            source_lat = cur_data.loc[nearest_indices, 'Latitude'].values
            source_lon = cur_data.loc[nearest_indices, 'Longitude'].values

            # cur_data is guaranteed non-empty here; the first row's time stamps
            # are used for the whole orbit.
            hours_elapsed_val = cur_data['Hours_elapsed'].iloc[0]
            time_val = cur_data['Time'].iloc[0]

            coarse_map[key] = pd.DataFrame({
                'Latitude': center_lat,
                'Longitude': center_lon,
                'ColumnAmountO3': res_o3_values,
                'Hours_elapsed': [hours_elapsed_val] * num_center_points,
                'Time': [time_val] * num_center_points,
                'Source_Latitude': source_lat,
                'Source_Longitude': source_lon
            })
        return coarse_map

In [5]:
# Build the matcher once. The target grid depends only on the bounding box and
# the step sizes, so it is computed a single time outside the loops.
instance = center_matching_hour(df, lat_start, lat_end, lon_start, lon_end)
center_points = instance.make_center_points(step_lat = step_lat, step_lon= step_lon)

for year in years:        # years = [2023,2024]
    for month in months:
        # Derive the two-digit month tag from the loop variable. Previously the
        # tag was computed once before the loop, so every iteration read and
        # overwrote the same month's files.
        month_str = f"{month:02d}"

        # Resolve the paths before the try-block so the error messages below
        # can always name the file they refer to.
        pickle_path = os.path.join(mac_data_path, f'pickle_{year}')
        input_filename = f"orbit_map{str(year)[2:]}_{month_str}.pkl"
        input_filepath = os.path.join(pickle_path, input_filename)
        try:
            # load pickle (dense ORI data)
            # pickle.load can execute arbitrary code; only use trusted, locally produced files.
            with open(input_filepath, 'rb') as pickle_file:
                loaded_map = pickle.load(pickle_file)
            coarse_cen_map = instance.coarse_by_center(loaded_map, center_points)

            # Save pickle (coarse data)
            output_filename = f"coarse_cen_map_without_decrement_latitude{str(year)[2:]}_{month_str}.pkl"
            output_filepath = os.path.join(pickle_path, output_filename)
            with open(output_filepath, 'wb') as pickle_file:
                pickle.dump(coarse_cen_map, pickle_file)
            
            print(f"Successfully processed and saved data for year {str(year)[2:]} month {month_str}.")
        except FileNotFoundError:
            print(f"Warning: File {input_filename} not found. Skipping.")
        except Exception as e:
            # Best-effort batch run: report the failure and continue with the next month.
            print(f"Error processing file {input_filename}: {e}")

Successfully processed and saved data for year 24 month 07.


In [6]:
# Rebuild the output path of the processing cell from the current values of
# `year` and `month_str`, then load the coarse map back from disk.
pickle_path = os.path.join(mac_data_path, f'pickle_{year}')
output_filename = f"coarse_cen_map_without_decrement_latitude{str(year)[2:]}_{month_str}.pkl"
output_filepath = os.path.join(pickle_path, output_filename)

# pickle.load can execute arbitrary code; only use trusted, locally produced files.
with open(output_filepath, 'rb') as pickle_file:
    cbmap_ori = pickle.load(pickle_file)

# Stack every frame stored in the map into one long frame.
# NOTE(review): despite the name, this concatenates all entries of the map, not only day 1.
day1_df_ori = pd.concat(cbmap_ori.values(), axis=0, ignore_index=True)


## Compute C_{g,n}(u)

In [7]:
import numpy as np
import pandas as pd
from scipy.signal import convolve2d

def apply_laplacian_2d_valid(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies a 2D discrete Laplacian filter using 'mode=valid', returning a new,
    smaller DataFrame with the cropped results and corresponding coordinates.

    The rows of `df` must be ordered with latitude varying slowest and
    longitude varying fastest, because the values are reshaped row-wise into a
    (n_lat, n_lon) grid using the order in which coordinates first appear.

    Args:
        df (pd.DataFrame): A DataFrame for a single time slice, containing
                           'ColumnAmountO3', 'Latitude', and 'Longitude' columns.
                           'Time' and 'Hours_elapsed' are carried over when present.

    Returns:
        pd.DataFrame: A new, smaller DataFrame containing the 'laplacian' values
                      and the corresponding 'Latitude' and 'Longitude', plus
                      'Time' / 'Hours_elapsed' (first-row value of the slice)
                      when those columns exist. An empty DataFrame is returned
                      when the grid has fewer than 3 points along either axis.

    Raises:
        ValueError: If the number of rows is not n_lat * n_lon.
    """
    # Dynamically determine the grid shape and unique coordinates
    unique_lats = df['Latitude'].unique()
    unique_lons = df['Longitude'].unique()
    lat_count = len(unique_lats)
    lon_count = len(unique_lons)
    
    if df.shape[0] != lat_count * lon_count:
        raise ValueError("DataFrame size does not match the grid dimensions (lat * lon).")

    # A 3x3 kernel removes one element from each of the four sides.
    cropped_lats = unique_lats[1:-1]
    cropped_lons = unique_lons[1:-1]

    # Bail out BEFORE convolving: convolve2d(mode='valid') raises for grids
    # that are thinner than the kernel along only one axis, so the guard has
    # to come first to honour the "empty result" contract.
    if len(cropped_lats) == 0 or len(cropped_lons) == 0:
        return pd.DataFrame()

    # Reshape the data based on its flattening order (lat slowest, lon fastest)
    grid = df['ColumnAmountO3'].values.reshape((lat_count, lon_count))

    # Standard 5-point discrete Laplacian (symmetric, so convolution == correlation)
    laplacian_kernel = np.array([[0, 1, 0],
                                 [1, -4, 1],
                                 [0, 1, 0]])

    # 'valid' mode keeps only positions where the kernel fits entirely inside the grid.
    laplacian_grid_cropped = convolve2d(grid, laplacian_kernel, mode='valid')

    # Coordinates of the surviving interior points, in the same row-major order
    new_lon_grid, new_lat_grid = np.meshgrid(cropped_lons, cropped_lats)
    
    new_df = pd.DataFrame({
        'Latitude': new_lat_grid.flatten(),
        'Longitude': new_lon_grid.flatten(),
        'laplacian': laplacian_grid_cropped.flatten()
    })
    
    # Carry over the slice-level time stamps when available (first row is
    # representative because the frame holds a single time slice).
    for time_col in ('Time', 'Hours_elapsed'):
        if time_col in df.columns:
            new_df[time_col] = df[time_col].iloc[0]

    return new_df

def subset(df, lon_bounds=(123.5, 132), lat_bounds=(0, 3), lon_skip=5, lat_skip=2):
    """
    Restricts a gridded DataFrame to a lat/lon window.

    The unique coordinates (in order of first appearance) are first limited to
    the inclusive bounds; then the first `lon_skip` longitudes and the first
    `lat_skip` latitudes of that selection are dropped. The defaults reproduce
    the window used throughout this notebook.

    Args:
        df (pd.DataFrame): Frame with 'Latitude' and 'Longitude' columns.
        lon_bounds (tuple): Inclusive (min, max) longitude bounds.
        lat_bounds (tuple): Inclusive (min, max) latitude bounds.
        lon_skip (int): Number of leading in-bounds longitudes to discard.
        lat_skip (int): Number of leading in-bounds latitudes to discard.

    Returns:
        pd.DataFrame: Matching rows with a fresh index, or an empty frame with
            the same columns when no coordinate survives the selection.
    """
    lon_values = df['Longitude'].unique()
    lat_values = df['Latitude'].unique()

    in_lon = lon_values[(lon_values >= lon_bounds[0]) & (lon_values <= lon_bounds[1])]
    in_lat = lat_values[(lat_values >= lat_bounds[0]) & (lat_values <= lat_bounds[1])]

    lon_cut = in_lon[lon_skip:]
    lat_cut = in_lat[lat_skip:]

    # Nothing left on one of the axes -> return an empty frame instead of failing later.
    if len(lon_cut) == 0 or len(lat_cut) == 0:
        return pd.DataFrame(columns=df.columns)

    keep = df['Longitude'].isin(lon_cut) & df['Latitude'].isin(lat_cut)
    return df[keep].reset_index(drop=True)

# --- Example Usage ---
# Driver: crop and Laplacian-filter the first eight entries of `cbmap_ori`,
# then stack the results into `final_df`, which later cells read.
if __name__ == '__main__':


    # Keys in the map's insertion order.
    a = list(cbmap_ori.keys())
    
    processed_dfs = []
    
    # Only the first eight keys are processed.
    for k in a[:8]:
        print(f"  Processing key: {k}")
        
        # Crop to the analysis window first; an empty crop is skipped.
        cur = subset(cbmap_ori[k])
        
        if not cur.empty:
            # The function now returns a new, smaller DataFrame with the filtered data
            filtered_df = apply_laplacian_2d_valid(cur)
            if not filtered_df.empty:
                processed_dfs.append(filtered_df)

    # 3. Concatenate all processed (and now smaller) DataFrames
    # NOTE: `final_df` is only defined when at least one slice was processed.
    if processed_dfs:
        final_df = pd.concat(processed_dfs, axis=0, ignore_index=True)
        
        print("\n--- Processing Complete ---")
        print("Final DataFrame shape:", final_df.shape)
        print("Columns:", final_df.columns)
        print("\nHead of the final DataFrame:")
        print(final_df.head())
        print("\nTail of the final DataFrame:")
        print(final_df.tail())
    else:
        print("\nProcessing resulted in an empty DataFrame.")

  Processing key: y24m07day01_hm00:52
  Processing key: y24m07day01_hm01:52
  Processing key: y24m07day01_hm02:52
  Processing key: y24m07day01_hm03:52
  Processing key: y24m07day01_hm04:48
  Processing key: y24m07day01_hm05:48
  Processing key: y24m07day01_hm06:48
  Processing key: y24m07day01_hm07:48

--- Processing Complete ---
Final DataFrame shape: (65536, 5)
Columns: Index(['Latitude', 'Longitude', 'laplacian', 'Time', 'Hours_elapsed'], dtype='object')

Head of the final DataFrame:
   Latitude  Longitude  laplacian                 Time  Hours_elapsed
0    2.8438   131.6138   -6.56543  2024-07-01 00:52:00  477720.866667
1    2.8438   131.5508   -3.03646  2024-07-01 00:52:00  477720.866667
2    2.8438   131.4878   20.61612  2024-07-01 00:52:00  477720.866667
3    2.8438   131.4248   -4.31279  2024-07-01 00:52:00  477720.866667
4    2.8438   131.3618   -1.02134  2024-07-01 00:52:00  477720.866667

Tail of the final DataFrame:
       Latitude  Longitude  laplacian                 Tim

In [70]:
# Split the stacked frame back into one frame per distinct 'Hours_elapsed'
# value, keyed by its position (0, 1, ...) in order of first appearance.
times = final_df['Hours_elapsed'].unique()
cbmap = {}
for t in range(len(times)):
    cbmap[t] = final_df[final_df['Hours_elapsed'] == times[t]].reset_index(drop=True)

# Display one slice as the cell output (raises KeyError if fewer than 8 time values exist).
cbmap[7]

Unnamed: 0,Latitude,Longitude,laplacian,Time,Hours_elapsed
0,2.8438,131.6138,2.77747,2024-07-01 07:48:00,477727.8
1,2.8438,131.5508,-15.47635,2024-07-01 07:48:00,477727.8
2,2.8438,131.4878,-1.04339,2024-07-01 07:48:00,477727.8
3,2.8438,131.4248,0.72936,2024-07-01 07:48:00,477727.8
4,2.8438,131.3618,-5.38342,2024-07-01 07:48:00,477727.8
...,...,...,...,...,...
8187,0.0718,123.8648,2.81470,2024-07-01 07:48:00,477727.8
8188,0.0718,123.8018,1.03326,2024-07-01 07:48:00,477727.8
8189,0.0718,123.7388,-3.56801,2024-07-01 07:48:00,477727.8
8190,0.0718,123.6758,-3.82671,2024-07-01 07:48:00,477727.8


# 2D SPACE

In [None]:
import numpy as np
import cmath
import pandas as pd
import time

def cgn(u, n1=64, n2=128):
    """
    Computes a 2D Bartlett window function (triangular window).

    The window is the product of two triangular tapers,
    (1 - |u1| / n1) * (1 - |u2| / n2).

    Args:
        u (tuple): A tuple of lag indices (u1, u2).
        n1 (int): Grid size along the first dimension. Defaults to 64, the
            value that was previously hard-coded.
        n2 (int): Grid size along the second dimension. Defaults to 128, the
            value that was previously hard-coded.
        
    Returns:
        float: The window value.
    """
    u1, u2 = u
    return (1 - np.abs(u1) / n1) * (1 - np.abs(u2) / n2)

def cov_x(u1, u2, params):
    """
    Anisotropic exponential autocovariance of the unfiltered process.

    Evaluates sigma2 * exp(-sqrt((u1 / alpha1)^2 + (u2 / alpha2)^2)).

    Args:
        u1: Lag along the first axis.
        u2: Lag along the second axis.
        params (list): Three values [sigma2, alpha1, alpha2]: the variance and
            the range parameter of each axis.

    Returns:
        float: The autocovariance at lag (u1, u2).
    """
    sigma2, alpha1, alpha2 = params

    # Scale each lag by its range, then take the Euclidean norm.
    scaled_first = u1 / alpha1
    scaled_second = u2 / alpha2
    radius = np.sqrt(scaled_first**2 + scaled_second**2)

    return sigma2 * np.exp(-radius)

def cov_laplacian(u1, u2, params):
    """
    Autocovariance of the process after the 5-point Laplacian filter.

    The filtered covariance is the double sum, over every ordered pair of
    stencil points, of the product of their weights times the covariance of
    the unfiltered process at the correspondingly shifted lag.

    Args:
        u1 (int): Lag index along the first axis.
        u2 (int): Lag index along the second axis.
        params (list): Parameters forwarded unchanged to `cov_x`.

    Returns:
        float: The autocovariance of the filtered process at lag (u1, u2).
    """
    # Grid spacing used to turn index lags into physical lags.
    delta1, delta2 = 0.044, 0.063

    # (offset, weight) pairs of the discrete Laplacian stencil.
    stencil = [((0, 0), -4), ((0, 1), 1), ((0, -1), 1), ((1, 0), 1), ((-1, 0), 1)]

    # Accumulate left-to-right over all 25 ordered stencil pairs.
    return sum(
        w_first * w_second * cov_x((u1 + a - c) * delta1, (u2 + b - d) * delta2, params)
        for (a, b), w_first in stencil
        for (c, d), w_second in stencil
    )

def cn_bar(u1, u2, params):
    """
    Windowed autocovariance of the Laplacian-filtered process.

    Multiplies `cov_laplacian` at lag (u1, u2) by the 2D Bartlett window
    `cgn` evaluated at the same lag.

    Args:
        u1 (int): Lag index along the first axis.
        u2 (int): Lag index along the second axis.
        params (list): Model parameters forwarded to `cov_laplacian`.

    Returns:
        float: The tapered autocovariance value.
    """
    filtered_cov = cov_laplacian(u1, u2, params)
    taper = cgn((u1, u2))
    return filtered_cov * taper


def expected_periodogram_fft(params, n1, n2, delta1, delta2):
    """
    Expected periodogram on the full n1 x n2 Fourier grid via one 2D FFT.

    For every lag (u1, u2) with 0 <= u1 < n1 and 0 <= u2 < n2 the windowed
    covariance `cn_bar` is summed over the four lags obtained by subtracting
    n1 and/or n2, which folds the negative lags onto the grid. The folded
    array is then Fourier transformed and scaled by
    delta1 * delta2 / (2 * pi)^2.

    Args:
        params (list): Model parameters forwarded to `cn_bar`.
        n1 (int): Number of samples along the first dimension.
        n2 (int): Number of samples along the second dimension.
        delta1 (float): Sampling interval along the first dimension.
        delta2 (float): Sampling interval along the second dimension.

    Returns:
        np.ndarray: Complex (n1, n2) array of expected periodogram values.
    """
    folded_cov = np.zeros((n1, n2), dtype=complex)

    for row, col in np.ndindex(n1, n2):
        # Same summation order as the four aliased terms: (0,0), (-n1,-n2), (0,-n2), (-n1,0)
        folded_cov[row, col] = (
            cn_bar(row, col, params)
            + cn_bar(row - n1, col - n2, params)
            + cn_bar(row, col - n2, params)
            + cn_bar(row - n1, col, params)
        )

    scale = (delta1 * delta2) / (2 * cmath.pi)**2
    return np.fft.fft2(folded_cov) * scale

def compute_2d_periodogram_from_df(df, value_column='laplacian', lat_column='Latitude', lon_column='Longitude'):
    """
    Periodogram of a gridded field stored in long (one row per point) format.

    The frame is pivoted into a 2D grid (rows indexed by `lat_column`, columns
    by `lon_column`), transformed with a 2D FFT, and the squared modulus is
    divided by the number of grid points.

    Args:
        df (pd.DataFrame): Input data in long format.
        value_column (str): Column holding the field values.
        lat_column (str): Column used as the row index of the grid.
        lon_column (str): Column used as the column index of the grid.

    Returns:
        np.ndarray: Real 2D array with the same shape as the pivoted grid.
    """
    # Long -> wide: one row per latitude, one column per longitude.
    field = df.pivot_table(index=lat_column, columns=lon_column, values=value_column).values

    n_rows, n_cols = field.shape
    spectrum = np.fft.fft2(field)

    # |DFT|^2 normalised by the sample count.
    return (np.abs(spectrum)**2) / (n_rows * n_cols)



def likelihood(params, df):
    """
    Per-point Whittle-type negative log-likelihood for one gridded field.

    Compares the data periodogram with the expected periodogram implied by
    `params` and returns sum(log(E) + I / E) / n, where the real part of the
    expected periodogram E is floored at 1e-10.

    Args:
        params (list): Model parameters forwarded to `expected_periodogram_fft`.
        df (pd.DataFrame): Long-format field accepted by
            `compute_2d_periodogram_from_df`.

    Returns:
        float: The negative log-likelihood divided by the number of grid points.
    """
    observed = compute_2d_periodogram_from_df(df)
    n_rows, n_cols = observed.shape

    # Grid spacing of the data.
    spacing_lat, spacing_lon = 0.044, 0.063
    expected = expected_periodogram_fft(params, n_rows, n_cols, spacing_lat, spacing_lon)

    # Work on flattened arrays; keep only the real part of the expectation and
    # floor it so the logarithm and the ratio stay finite.
    observed_flat = observed.flatten()
    expected_floor = np.maximum(expected.flatten().real, 1e-10)

    total = np.sum(np.log(expected_floor) + observed_flat / expected_floor)
    return total / (n_rows * n_cols)

# Evaluate the 2D likelihood once at a hand-picked parameter vector.
params = [20, 0.5, 0.5]  # Example parameters: [sigma2, alpha1, alpha2]
# NOTE(review): `df1` is not defined in any cell visible here; presumably it is one
# time slice such as cbmap[0] -- define it explicitly so the notebook runs on a fresh kernel.
a = likelihood(params, df1)
print(a)

# Show the raw data periodogram of the same slice as the cell output.
periodogram_values = compute_2d_periodogram_from_df(df1)
periodogram_values


55636.26023825511


array([[0.28387959, 0.09838602, 0.22251661, ..., 0.33407337, 0.22251661,
        0.09838602],
       [0.12220644, 0.27996254, 0.6458887 , ..., 0.49251526, 1.67686008,
        0.01284934],
       [0.24987992, 0.12154554, 0.19677225, ..., 1.39454568, 0.57255071,
        0.15569056],
       ...,
       [1.08069075, 0.26650067, 0.78651477, ..., 2.68740763, 0.47185816,
        4.24604453],
       [0.24987992, 0.15569056, 0.57255071, ..., 0.32074295, 0.19677225,
        0.12154554],
       [0.12220644, 0.01284934, 1.67686008, ..., 0.79192762, 0.6458887 ,
        0.27996254]])

# 3D space

In [89]:
# Collect the first eight per-time slices of cbmap, in key order 0..7.
df_list = [cbmap[i] for i in range(8)]

In [None]:
import numpy as np
import cmath
import pandas as pd
import time
import torch


def cgn(u, n1=64, n2=128):
    """
    Computes a 2D Bartlett window function (triangular window).

    The window is (1 - |u1| / n1) * (1 - |u2| / n2). The grid sizes are
    optional so that both call styles used in this cell work: `cgn(u)` and
    `cgn(u, n1, n2)`.

    Args:
        u (tuple): A tuple of lag indices (u1, u2).
        n1 (int): Grid size along the first dimension (default 64, the value
            that was previously hard-coded).
        n2 (int): Grid size along the second dimension (default 128, the value
            that was previously hard-coded).
        
    Returns:
        float: The window value.
    """
    u1, u2 = u
    return (1 - np.abs(u1) / n1) * (1 - np.abs(u2) / n2)

def cov_x(u1, u2, t, params):
    """
    Spatio-temporal exponential autocovariance of the unfiltered process.

    With d2 = (u1/range_lat - advec_lat*t)^2 + (u2/range_lon - advec_lon*t)^2
    + (beta*t)^2, the value is sigmasq * exp(-sqrt(d2)) where d2 != 0 and
    sigmasq + nugget where d2 == 0.

    Plain numbers and tensors are both accepted; the selection is done
    element-wise, so tensor-valued lags work as well.

    Args:
        u1: First spatial lag (number or torch.Tensor).
        u2: Second spatial lag (number or torch.Tensor).
        t: Temporal lag (number or torch.Tensor).
        params: Seven values
            [sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget].

    Returns:
        torch.Tensor: The autocovariance, with the broadcast shape of the inputs.
    """
    sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget = params
    
    distance = (u1/range_lat - advec_lat * t)**2 + (u2/range_lon - advec_lon * t)**2 + (beta * t)**2

    # torch.sqrt / torch.exp only accept tensors; promote plain numbers
    # (double precision, to match Python float arithmetic).
    if not torch.is_tensor(distance):
        distance = torch.tensor(distance, dtype=torch.float64)

    at_zero_lag = torch.as_tensor(sigmasq + nugget, dtype=distance.dtype)
    decayed = sigmasq * torch.exp(-torch.sqrt(distance))

    # Element-wise replacement for `if distance != 0`, which is ambiguous for
    # tensors holding more than one element.
    return torch.where(distance != 0, decayed, at_zero_lag)
    

def cov_laplacian(u1, u2, t, params):
    """
    Spatio-temporal autocovariance of the Laplacian-filtered process.

    Sums, over every ordered pair of points of the 5-point Laplacian stencil,
    the product of the two stencil weights times `cov_x` evaluated at the
    spatially shifted lag; the temporal lag `t` is passed through unchanged.

    Args:
        u1 (int): Lag index along the first spatial axis.
        u2 (int): Lag index along the second spatial axis.
        t: Temporal lag forwarded to `cov_x`.
        params (list): Parameters forwarded unchanged to `cov_x`.

    Returns:
        The autocovariance of the filtered process at lag (u1, u2, t).
    """
    # Grid spacing used to turn index lags into physical lags.
    delta1, delta2 = 0.044, 0.063

    # Stencil offsets and weights, kept as parallel tuples.
    offsets = ((0, 0), (0, 1), (0, -1), (1, 0), (-1, 0))
    weights = (-4, 1, 1, 1, 1)

    total = 0
    for (a, b), w_first in zip(offsets, weights):
        for (c, d), w_second in zip(offsets, weights):
            shifted_lat = (u1 + a - c) * delta1
            shifted_lon = (u2 + b - d) * delta2
            total += w_first * w_second * cov_x(shifted_lat, shifted_lon, t, params)

    return total

def cn_bar(u1, u2, t,params):
    """
    Windowed spatio-temporal autocovariance of the Laplacian-filtered process.

    Multiplies `cov_laplacian` at lag (u1, u2, t) by the 2D Bartlett window
    `cgn` evaluated at the spatial lag (u1, u2).

    Args:
        u1 (int): Lag index along the first spatial axis.
        u2 (int): Lag index along the second spatial axis.
        t: Temporal lag forwarded to `cov_laplacian`.
        params (list): Model parameters forwarded to `cov_laplacian`.

    Returns:
        The tapered autocovariance value.
    """
    filtered_cov = cov_laplacian(u1, u2, t, params)
    taper = cgn((u1, u2))
    return filtered_cov * taper

def expected_periodogram_fft_matrix(params, n1, n2, delta1, delta2, p=8):
    """
    Expected cross-periodogram matrices on the full n1 x n2 Fourier grid.

    For every spatial lag (u1, u2) and every pair of components (q, r) the
    Bartlett-windowed covariance of the Laplacian-filtered process at temporal
    lag q - r is stored in a (n1, n2, p, p) array, which is then Fourier
    transformed over the two spatial axes and scaled by
    delta1 * delta2 / (2 * pi)^2.

    Args:
        params (list): Model parameters forwarded to `cov_laplacian`.
        n1 (int): Number of samples along the first spatial dimension.
        n2 (int): Number of samples along the second spatial dimension.
        delta1 (float): Sampling interval along the first dimension.
        delta2 (float): Sampling interval along the second dimension.
        p (int): Number of components (time slices). Defaults to 8, the value
            that was previously hard-coded.

    Returns:
        np.ndarray: Complex (n1, n2, p, p) array, fft-shifted along the two
            spatial axes.
    """
    product_tensor = np.zeros((n1, n2, p, p), dtype=complex)

    # NOTE(review): only non-negative spatial lags are summed here, whereas the
    # univariate `expected_periodogram_fft` also folds in the lags shifted by
    # -n1 / -n2. Confirm which form the estimator requires.
    for u1 in range(n1):
        for u2 in range(n2):
            # Bartlett window for this spatial lag, written out with the actual
            # grid size (the single-argument `cgn` of this cell cannot take n1, n2).
            window = (1 - abs(u1) / n1) * (1 - abs(u2) / n2)

            # The covariance depends on (q, r) only through t = q - r, so each
            # of the 2p - 1 distinct temporal lags is evaluated once.
            # complex() also unwraps 0-d tensors that carry gradients.
            by_lag = {
                t: complex(window * cov_laplacian(u1, u2, t, params))
                for t in range(-(p - 1), p)
            }

            for q in range(p):
                for r in range(p):
                    product_tensor[u1, u2, q, r] = by_lag[q - r]

    # 2D FFT over the spatial axes, independently for every (q, r) pair.
    fft_result = np.fft.fft2(product_tensor, axes=(0, 1))

    normalization_factor = (delta1 * delta2) / (2 * cmath.pi)**2
    expected_periodogram_tensor = fft_result * normalization_factor

    # Shift so the frequency order matches the fft-shifted data quantities.
    return np.fft.fftshift(expected_periodogram_tensor, axes=(0, 1))


def compute_2d_multivariate_periodogram_from_df_list(df_list, value_column='laplacian', lat_column='Latitude', lon_column='Longitude'):
    """
    Cross-periodogram matrices for a list of gridded fields.

    Every frame is pivoted to a 2D grid and Fourier transformed; entry (q, r)
    of the result is J_q * conj(J_r) / (n1 * n2), where n1 x n2 is the grid
    shape of the last frame. The result is fft-shifted along both frequency axes.

    Args:
        df_list (list): Long-format DataFrames, one per component.
        value_column (str): Column holding the field values.
        lat_column (str): Column used as the row index of the grid.
        lon_column (str): Column used as the column index of the grid.

    Returns:
        np.ndarray: Complex array of shape (n1, n2, p, p); shape (0, 0, 0, 0)
            for an empty list.
    """
    # One 2D spectrum per component.
    spectra = [
        np.fft.fft2(
            frame.pivot_table(index=lat_column, columns=lon_column, values=value_column).values
        )
        for frame in df_list
    ]
    n_components = len(df_list)
    n_rows, n_cols = spectra[-1].shape if spectra else (0, 0)

    cross = np.zeros((n_rows, n_cols, n_components, n_components), dtype=complex)

    # I_qr(w) = J_q(w) * conj(J_r(w)) / (n1 * n2)
    for q, spec_q in enumerate(spectra):
        for r, spec_r in enumerate(spectra):
            cross[:, :, q, r] = (spec_q * np.conj(spec_r)) / (n_rows * n_cols)

    # Centre the zero frequency so the order matches the expected periodogram.
    return np.fft.fftshift(cross, axes=(0, 1))



def generate_Jvector(df_list, value_column='laplacian', lat_column='Latitude', lon_column='Longitude'):
    """
    Stacks the 2D DFTs of several gridded fields into one array.

    Every frame is pivoted to a 2D grid (rows = `lat_column`, columns =
    `lon_column`) and Fourier transformed; the transforms are stacked along a
    new last axis and fft-shifted along the two frequency axes.

    Args:
        df_list (list): Long-format DataFrames, one per component.
        value_column (str): Column holding the field values.
        lat_column (str): Column used as the row index of the grid.
        lon_column (str): Column used as the column index of the grid.

    Returns:
        np.ndarray: Complex array of shape (n1, n2, p).
    """
    spectra = []
    for frame in df_list:
        field = frame.pivot_table(index=lat_column, columns=lon_column, values=value_column).values
        # The pivoted field must be two-dimensional.
        n_rows, n_cols = field.shape
        spectra.append(np.fft.fft2(field))

    # (n1, n2, p): the component index becomes the last axis.
    stacked = np.stack(spectra, axis=2)

    # Centre the zero frequency along both spatial-frequency axes.
    return np.fft.fftshift(stacked, axes=(0, 1))


def likelihood(params, df_list):
    """
    Per-point multivariate Whittle-type negative log-likelihood.

    For every Fourier frequency w, with F(w) the expected cross-periodogram
    matrix and J(w) the vector of component DFTs, the contribution is

        log|det F(w)| + J(w)^H F(w)^{-1} J(w) / (n1 * n2)

    The 1 / (n1 * n2) factor is the normalisation of the data periodogram
    (I = J J^H / (n1 * n2)), so the second term equals trace(F^{-1} I).
    NOTE(review): confirm this normalisation against the reference formula.

    Args:
        params (list): Model parameters forwarded to
            `expected_periodogram_fft_matrix`.
        df_list (list): Long-format DataFrames, one per component, all on the
            same grid.

    Returns:
        float: The summed contributions divided by the number of grid points.

    Raises:
        ValueError: If the number of components in `df_list` does not match
            the size of the expected periodogram matrices.
    """
    delta1, delta2 = 0.044, 0.063

    # DFT vectors, fft-shifted: shape (n1, n2, p).
    j_vector = generate_Jvector(df_list)
    n1, n2, p = j_vector.shape
    n = n1 * n2

    # Expected cross-periodogram matrices, fft-shifted the same way: (n1, n2, p, p).
    expected_periodogram_values = expected_periodogram_fft_matrix(params, n1, n2, delta1, delta2)
    if expected_periodogram_values.shape != (n1, n2, p, p):
        raise ValueError(
            f"Expected periodogram has shape {expected_periodogram_values.shape}, "
            f"but the data require {(n1, n2, p, p)}."
        )

    # log|det F| per frequency; slogdet avoids overflow/underflow of the determinant itself.
    _, log_abs_det = np.linalg.slogdet(expected_periodogram_values)

    # F^{-1} J per frequency via a batched linear solve (no explicit inverse).
    whitened = np.linalg.solve(expected_periodogram_values, j_vector[..., None])[..., 0]

    # J^H F^{-1} J; only the real part enters the likelihood.
    quad_form = np.real(np.sum(np.conj(j_vector) * whitened, axis=-1)) / n

    nll = np.sum(log_abs_det + quad_form)
    return nll / n

# 3D from here

In [131]:
import numpy as np
import cmath
import pandas as pd
import time
import torch

def cgn(u, n1, n2):
    """
    2D Bartlett (triangular) window evaluated with torch operations.

    Returns (1 - |u1| / n1) * (1 - |u2| / n2), computed element-wise.

    Args:
        u (tuple): Pair (u1, u2) of lag indices given as torch.Tensors.
        n1 (int): Number of samples along the first dimension.
        n2 (int): Number of samples along the second dimension.

    Returns:
        torch.Tensor: The window value(s).
    """
    lag_first, lag_second = u

    # One triangular taper per axis; the window is their product.
    taper_first = 1 - torch.abs(lag_first) / n1
    taper_second = 1 - torch.abs(lag_second) / n2
    return taper_first * taper_second

def cov_x(u1, u2, t, params):
    """
    Computes the spatio-temporal autocovariance of the original process.

    The covariance is ``sigmasq * exp(-sqrt(d))`` with
    ``d = (u1/range_lat - advec_lat*t)^2 + (u2/range_lon - advec_lon*t)^2 + (beta*t)^2``,
    and ``sigmasq + nugget`` wherever ``d`` is exactly zero.

    Args:
        u1 (torch.Tensor): The first spatial lag.
        u2 (torch.Tensor): The second spatial lag.
        t (torch.Tensor): The temporal lag.
        params: Seven values unpacked as
            (sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget).

    Returns:
        torch.Tensor: The autocovariance value(s), broadcast over the lags.
    """
    sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget = params

    distance = (u1 / range_lat - advec_lat * t)**2 + (u2 / range_lon - advec_lon * t)**2 + (beta * t)**2

    is_nonzero = distance != 0
    # torch.where still back-propagates through the branch it does not select.
    # d/dx sqrt(x) is infinite at 0, so feeding the raw distance into sqrt turns
    # every parameter gradient into NaN as soon as one lag has zero distance.
    # Substituting a harmless 1 at those positions keeps the backward pass finite
    # without changing the forward values.
    safe_distance = torch.where(is_nonzero, distance, torch.ones_like(distance))

    return torch.where(is_nonzero, sigmasq * torch.exp(-torch.sqrt(safe_distance)), sigmasq + nugget)

def cov_laplacian(u1, u2, t, params):
    """
    Autocovariance of the process after applying the 5-point discrete Laplacian.

    The result is the double sum over all pairs of stencil points of
    ``w_ab * w_cd * cov_x(...)``, with the lag shifted by the difference of the
    two stencil offsets and scaled by the grid spacings (0.044, 0.063).

    Args:
        u1 (torch.Tensor): Lag index along the first dimension.
        u2 (torch.Tensor): Lag index along the second dimension.
        t (torch.Tensor): Temporal lag.
        params (list): Covariance parameters forwarded to ``cov_x``.

    Returns:
        torch.Tensor: Filtered autocovariance, same shape as ``u1``.
    """
    delta1, delta2 = 0.044, 0.063

    # (offset, weight) pairs of the 5-point Laplacian stencil.
    stencil = (
        ((0, 0), -4),
        ((0, 1), 1),
        ((0, -1), 1),
        ((1, 0), 1),
        ((-1, 0), 1),
    )

    total = torch.zeros_like(u1)
    for (row_a, col_a), weight_a in stencil:
        for (row_b, col_b), weight_b in stencil:
            shifted_lag_1 = (u1 + row_a - row_b) * delta1
            shifted_lag_2 = (u2 + col_a - col_b) * delta2
            total += weight_a * weight_b * cov_x(shifted_lag_1, shifted_lag_2, t, params)

    return total

def cn_bar(u1, u2, t, params, n1, n2):
    """
    Windowed autocovariance: the Laplacian-filtered covariance multiplied by
    the 2D Bartlett window evaluated at the same lags.

    Args:
        u1 (torch.Tensor): Lag index along the first dimension.
        u2 (torch.Tensor): Lag index along the second dimension.
        t (torch.Tensor): Temporal lag.
        params (list): Model parameters.
        n1 (int): Sample count along the first dimension.
        n2 (int): Sample count along the second dimension.

    Returns:
        torch.Tensor: ``cov_laplacian(u1, u2, t, params) * cgn((u1, u2), n1, n2)``.
    """
    window = cgn((u1, u2), n1, n2)
    filtered_cov = cov_laplacian(u1, u2, t, params)
    return filtered_cov * window

def expected_periodogram_fft_multivariate(params, n1, n2, p):
    """
    Computes the multivariate expected periodogram for ALL frequencies using a 2D FFT.

    For every component pair (q, r) the windowed autocovariance ``cn_bar`` is
    evaluated on the lag grid ``[0, n1) x [0, n2)`` at temporal lag ``q - r``,
    transformed with a 2D FFT over the two spatial axes and scaled by
    ``delta1 * delta2 / (2*pi)^2``.

    Args:
        params (list): Model parameters, forwarded to ``cn_bar``.
        n1 (int): The number of samples in the first spatial dimension.
        n2 (int): The number of samples in the second dimension.
        p (int): The number of multivariate components.

    Returns:
        torch.Tensor: A complex 4D tensor (n1, n2, p, p) of expected periodogram
        matrices in the FFT's native frequency order (no fftshift is applied).
    """
    delta1, delta2 = 0.044, 0.063

    # Holds the windowed covariance c_g,n * c_X for every lag and component pair.
    product_tensor = torch.zeros((n1, n2, p, p), dtype=torch.complex64)

    # One temporal position per component.
    t_lags = torch.arange(p, dtype=torch.float32)

    # NOTE(review): only non-negative spatial lags are evaluated here; confirm
    # against the reference formula whether negative lags must be folded in.
    u1_mesh, u2_mesh = torch.meshgrid(torch.arange(n1, dtype=torch.float32), torch.arange(n2, dtype=torch.float32), indexing='ij')

    # cn_bar depends on (q, r) only through the temporal lag q - r, so each of
    # the 2p - 1 distinct lags is evaluated once and reused.
    windowed_by_lag = {}
    for q in range(p):
        for r in range(p):
            lag = q - r
            if lag not in windowed_by_lag:
                t = t_lags[q] - t_lags[r]
                windowed_by_lag[lag] = cn_bar(u1_mesh, u2_mesh, t, params, n1, n2)
            product_tensor[:, :, q, r] = windowed_by_lag[lag]

    # 2D FFT over the two spatial axes for every component pair.
    fft_result = torch.fft.fft2(product_tensor, dim=(0, 1))

    # delta1 * delta2 plays the role of dx1 dx2 in the continuous integral.
    normalization_factor = (delta1 * delta2) / (2 * cmath.pi)**2

    return fft_result * normalization_factor



In [1]:
def generate_Jvector(df_list, value_column='laplacian', lat_column='Latitude', lon_column='Longitude'):
    """
    Build the normalised DFT vectors of a list of gridded components.

    Each DataFrame is pivoted into a latitude-by-longitude grid, transformed
    with a 2D FFT, and the results are stacked along a trailing component axis
    and scaled by ``1 / (2*pi)``.

    Args:
        df_list (list): One DataFrame per multivariate component.
        value_column (str): Column holding the data values.
        lat_column (str): Column used as the row index of the grid.
        lon_column (str): Column used as the column headers of the grid.

    Returns:
        torch.Tensor: Complex tensor of shape (n1, n2, p).
    """
    def _component_fft(frame):
        # Pivot to a 2D grid, then transform it in single precision.
        grid = frame.pivot_table(index=lat_column, columns=lon_column, values=value_column).values
        return torch.fft.fft2(torch.tensor(grid, dtype=torch.float32))

    stacked = torch.stack([_component_fft(frame) for frame in df_list], dim=2)

    # DFT normalisation (2*pi)^(-d/2) with d = 2.
    scale = 1.0 / (2 * cmath.pi)
    return stacked * scale
def likelihood_vectorized(params, df_list):
    """
    Calculates the negative log-likelihood for the multivariate model
    using batched linear algebra over all spatial frequencies.

    For each frequency the contribution is ``log|det(I)| + J^H I^{-1} J``,
    where ``I`` is the real part of the expected periodogram matrix (plus a
    tiny relative jitter on the diagonal) and ``J`` is the DFT vector.

    Args:
        params (list): Model parameters.
        df_list (list): A list of DataFrames representing the multivariate data.

    Returns:
        torch.Tensor: Scalar tensor with the negative log-likelihood divided by
        the number of grid points (``0.0`` when ``df_list`` is empty).
    """
    p = len(df_list)
    if p == 0:
        return torch.tensor(0.0)

    # Determine dimensions from the first DataFrame
    n1, n2 = df_list[0].pivot_table(index='Latitude', columns='Longitude', values='laplacian').shape
    n = n1 * n2

    # Model side: expected periodogram; data side: DFT vectors.
    expected_periodogram = expected_periodogram_fft_multivariate(params, n1, n2, p)
    j_vector = generate_Jvector(df_list)

    # One p x p matrix and one p x 1 vector per frequency.
    I_omega_batch = expected_periodogram.reshape(-1, p, p)
    J_omega_batch = j_vector.reshape(-1, p, 1)

    # Jitter proportional to each matrix's norm, added to the diagonal of the real part.
    # NOTE(review): a relative jitter of 1e-10 is below single-precision epsilon,
    # so it has no numerical effect unless the inputs are float64.
    scale = torch.linalg.norm(I_omega_batch, dim=(1, 2), keepdim=True)
    identity = torch.eye(p, dtype=scale.dtype, device=scale.device)
    I_omega_stable = torch.real(I_omega_batch) + scale * 1e-10 * identity

    # log|det| computed directly; forming det first can overflow or underflow.
    _, log_det = torch.linalg.slogdet(I_omega_stable)

    # Quadratic form J^H I^{-1} J through a linear solve rather than an explicit inverse.
    solved = torch.linalg.solve(I_omega_stable.to(J_omega_batch.dtype), J_omega_batch)
    term2_batch = torch.bmm(torch.conj(J_omega_batch.transpose(1, 2)), solved).reshape(-1)

    nll_batch = log_det + term2_batch.real

    # Sum over frequencies and normalise by the number of grid points.
    return torch.sum(nll_batch) / n

# Example parameters: [sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget]
params = torch.tensor([20.0, 0.5, 0.5, 0.1, 0.1, 0.1, 0.01], requires_grad=True)

# Evaluate the function defined in this cell (the original call targeted `likelihood`,
# which this cell does not define).
a = likelihood_vectorized(params, df_list)
print(a)


NameError: name 'torch' is not defined

In [None]:

def generate_Jvector(df_list, value_column='laplacian', lat_column='Latitude', lon_column='Longitude'):
    """
    Build the frequency-centred DFT vectors of a list of gridded components.

    Each DataFrame is pivoted into a latitude-by-longitude grid and transformed
    with a 2D FFT; the results are stacked along a trailing component axis and
    fftshift-ed over the two spatial axes. No normalisation is applied.

    Args:
        df_list (list): One DataFrame per multivariate component.
        value_column (str): Column holding the data values.
        lat_column (str): Column used as the row index of the grid.
        lon_column (str): Column used as the column headers of the grid.

    Returns:
        torch.Tensor: Complex tensor of shape (n1, n2, p).
    """
    def _component_fft(frame):
        # Pivot to a 2D grid, then transform it in single precision.
        grid = frame.pivot_table(index=lat_column, columns=lon_column, values=value_column).values
        return torch.fft.fft2(torch.tensor(grid, dtype=torch.float32))

    stacked = torch.stack([_component_fft(frame) for frame in df_list], dim=2)

    # Centre the zero frequency along both spatial axes.
    return torch.fft.fftshift(stacked, dim=(-3, -2))

def likelihood(params, df_list):
    """
    Calculates the negative log-likelihood for the multivariate model.

    For each spatial frequency the contribution is
    ``log det(I) + J^H I^{-1} J``, where ``I`` is the real part of the expected
    periodogram matrix plus ``1e-10`` on the diagonal and ``J`` is the DFT
    vector. Contributions are summed and divided by the number of grid points.

    Args:
        params: Seven model parameters (list or 1-D tensor). A tensor that
            requires grad stays connected to the returned value.
        df_list (list): A list of DataFrames representing the multivariate data.

    Returns:
        torch.Tensor: Scalar negative log-likelihood; ``0.0`` for an empty
        ``df_list`` and ``inf`` if a matrix cannot be solved.
    """
    p = len(df_list)
    if p == 0:
        return torch.tensor(0.0)

    # Determine dimensions from the first DataFrame
    n1, n2 = df_list[0].pivot_table(index='Latitude', columns='Longitude', values='laplacian').shape
    n = n1 * n2

    # Keep the caller's tensor in the autograd graph. Re-wrapping every entry in
    # torch.tensor(...) would create detached copies, so no gradient could reach `params`.
    params_tensor = torch.as_tensor(params, dtype=torch.float32)

    # Expected periodogram from the model, in the FFT's native frequency order.
    expected_periodogram = expected_periodogram_fft_multivariate(params_tensor, n1, n2, p)
    # generate_Jvector in this cell returns fftshift-ed DFT vectors, so the model
    # side is shifted the same way to pair each J with the matrix of its own frequency.
    expected_periodogram = torch.fft.fftshift(expected_periodogram, dim=(0, 1))

    # NOTE(review): the DFT vectors from this cell's generate_Jvector carry no
    # normalisation, while the expected periodogram is scaled by
    # delta1*delta2/(2*pi)^2 -- confirm the two are on the same scale.
    j_vector = generate_Jvector(df_list)

    # One p x p matrix and one p x 1 vector per frequency.
    I_omega_stable = torch.real(expected_periodogram).reshape(-1, p, p) + 1e-10 * torch.eye(p)
    J_omega = j_vector.reshape(-1, p, 1)

    try:
        # log(det(I)) for every frequency (NaN for a negative determinant, as before).
        term1 = torch.logdet(I_omega_stable)
        # I^{-1} J via a batched solve; the real matrix is cast to J's complex
        # dtype because torch does not multiply mixed real/complex operands.
        solved = torch.linalg.solve(I_omega_stable.to(J_omega.dtype), J_omega)
    except torch.linalg.LinAlgError:
        # Handle cases where a matrix is singular or not invertible
        return torch.tensor(float('inf'))

    # J^H (I^{-1} J); the imaginary part is dropped, as in the original return value.
    term2 = torch.sum(torch.conj(J_omega) * solved, dim=(1, 2)).real

    # Normalize the total negative log-likelihood
    return torch.sum(term1 + term2) / n

# Example parameters: [sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget]
params = torch.tensor([20, 0.5, 0.5, 0.1, 0.1, 0.1, 0.1], requires_grad=True)
likelihood(params, df_list)


Import the fitted parameter estimates and build the covariance matrix

In [48]:
# Directory holding the estimate tables read below.
v05_base_path = Path("/Users/joonwonlee/Documents/GEMS_TCO-1/outputs/day/estimates/df_cv_smooth_05/")


#full_day_r2s10_v045_spline1250 = pd.read_csv( base_path / "full_day_r2s10_v045_spline1250.0.csv")
#full_day_r2s10_v055_spline1250 = pd.read_csv( base_path / "full_day_r2s10_v055_spline1250.0.csv")

# Each table is stored as <name>.csv and bound to a variable of the same name.
(
    full_day_v05_r2s10_1127,
    vecchia_v05_r2s10_1127,
    vecchia_v05_r2s10_4508,
    vecchia_v05_r2s10_18033,
) = (
    pd.read_csv(v05_base_path / f"{table_name}.csv")
    for table_name in (
        "full_day_v05_r2s10_1127",
        "vecchia_v05_r2s10_1127",
        "vecchia_v05_r2s10_4508",
        "vecchia_v05_r2s10_18033",
    )
)

In [59]:
# Keep the columns from position 5 up to (not including) the last two; in the
# output shown below these are sigma, range_lat, range_lon, advec_lat,
# advec_lon, beta, nugget and loss.
estimates = vecchia_v05_r2s10_18033.iloc[:,5:-2]
# First row of the selected estimates.
day1 = estimates.iloc[0]
# Display it as the cell output.
day1

sigma            21.197
range_lat         1.267
range_lon         1.635
advec_lat         0.026
advec_lon        -0.162
beta              0.172
nugget            4.814
loss         198194.371
Name: 0, dtype: float64

In [75]:
from typing import Dict, Any, Callable

class spatio_temporal_kernels:               #sigmasq range advec beta  nugget
    """
    Anisotropic spatio-temporal covariance with advection, exponential form.

    Parameters are always passed as seven values in the order
    (sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget).
    Location tensors are read column-wise: columns 0 and 1 are the two spatial
    coordinates (scaled by range_lat / range_lon respectively) and column 3 is time.
    """

    def __init__(self, smooth:float, aggregated_data: torch.Tensor):
        """
        Args:
            smooth (float): Smoothness value; stored but not used by the methods below.
            aggregated_data (torch.Tensor): 2-D tensor; the first four columns are kept.
        """
        self.smooth = smooth

        self.aggregated_data = aggregated_data[:,:4]
        self.aggregated_response = aggregated_data[:,2]
        self.aggregated_locs = aggregated_data[:,:2]

    ## The torch.sqrt() is moved to the covariance function to track gradients of beta and avec
    def custom_distance_matrix(self, U:torch.Tensor, V:torch.Tensor):
        """
        Pairwise SQUARED distance between the rows of U and the rows of V.

        Columns 0-1 are treated as spatial coordinates and column 2 as the
        (already scaled) temporal coordinate.

        Returns:
            torch.Tensor: Matrix of shape (len(U), len(V)).
        """
        # Efficient distance computation with broadcasting
        spatial_diff = torch.norm(U[:, :2].unsqueeze(1) - V[:, :2].unsqueeze(0), dim=2)
        temporal_diff = torch.abs(U[:, 2].unsqueeze(1) - V[:, 2].unsqueeze(0))
        distance = (spatial_diff**2 + temporal_diff**2)  # sqrt is applied later, in the covariance function
        return distance

    def precompute_coords_anisotropy(self, params:torch.Tensor, y: torch.Tensor, x: torch.Tensor)-> Tuple[torch.Tensor, torch.Tensor]:
        """
        Squared anisotropic distances between the rows of ``x`` and the rows of ``y``.

        Spatial coordinates are shifted by ``advec * t`` and divided by the
        range parameters; time is multiplied by ``beta``.

        Note the argument order: ``y`` comes before ``x``.

        Returns:
            tuple: ``(distance, non_zero_indices)`` where ``distance`` has shape
            (len(x), len(y)) and ``non_zero_indices`` is the mask ``distance != 0``.

        Raises:
            ValueError: If ``x`` or ``y`` is None.
        """
        sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget = params

        if y is None or x is None:
            raise ValueError("Both y and x_df must be provided.")

        x1, y1, t1 = x[:, 0], x[:, 1], x[:, 3]
        x2, y2, t2 = y[:, 0], y[:, 1], y[:, 3]

        # Advect, then rescale each spatial axis by its own range.
        spat_coord1 = torch.stack(( (x1 - advec_lat * t1)/range_lat, (y1 - advec_lon * t1)/range_lon ), dim=-1)
        spat_coord2 = torch.stack(( (x2 - advec_lat * t2)/range_lat, (y2 - advec_lon * t2)/range_lon ), dim=-1)

        U = torch.cat((spat_coord1, (beta * t1).reshape(-1, 1)), dim=1)
        V = torch.cat((spat_coord2, (beta * t2).reshape(-1, 1)), dim=1)

        distance = self.custom_distance_matrix(U,V)
        non_zero_indices = distance != 0
        return distance, non_zero_indices

    def matern_cov_anisotropy_v05(self,params: torch.Tensor, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """
        Exponential covariance matrix between the rows of ``x`` and the rows of ``y``.

        Entry (i, j) is ``sigmasq * exp(-sqrt(d_ij))`` for the squared distance
        ``d_ij`` between ``x[i]`` and ``y[j]``, and ``sigmasq`` where ``d_ij == 0``.
        ``nugget`` is then added along the main diagonal, so the result is
        expected to be square.

        Returns:
            torch.Tensor: Matrix of shape (len(x), len(y)).
        """
        sigmasq, range_lat, range_lon, advec_lat, advec_lon, beta, nugget = params

        # Passed by keyword: the helper's signature is (params, y, x), so a
        # positional (params, x, y) call would silently swap the two inputs and
        # return the transposed matrix whenever x and y differ.
        distance, non_zero_indices = self.precompute_coords_anisotropy(params, y=y, x=x)
        out = torch.zeros_like(distance)

        # sqrt is only ever applied to strictly positive entries.
        if torch.any(non_zero_indices):
            out[non_zero_indices] = sigmasq * torch.exp(- torch.sqrt(distance[non_zero_indices]))
        out[~non_zero_indices] = sigmasq

        # Nugget on the diagonal, created with the dtype/device of the matrix it is added to.
        out += torch.eye(out.shape[0], dtype=out.dtype, device=out.device) * nugget
        return out

In [None]:

# Coerce every column to numeric (unparseable entries become NaN), then
# replace the NaNs with 0 (or use .dropna() to discard them instead).
final_df_clean = final_df.apply(pd.to_numeric, errors='coerce').fillna(0)

# Convert the cleaned table to a single-precision tensor.
tensor_data = torch.tensor(final_df_clean.values, dtype=torch.float32)

# Covariance matrix of the data with itself, using every entry of day1 except the last.
instance = spatio_temporal_kernels(smooth=0.5, aggregated_data=tensor_data)
params = list(day1[:-1])
cov_matrix = instance.matern_cov_anisotropy_v05(
    torch.tensor(params, dtype=torch.float64),
    tensor_data,
    tensor_data,
)

Fortunately, there is a much more efficient method based on the Wiener–Khinchin theorem. Under the same circulant/periodic assumption that the FFT relies on, the power spectrum (which is exactly the set of diagonal elements you want) is simply the Fourier transform of the process's autocovariance (autocorrelation) function.

In [None]:
import numpy as np
import pandas as pd
from scipy import fft

def get_3d_fft_coefficients(data: np.ndarray, grid_shape: tuple = (66, 130, 8)) -> np.ndarray:
    """
    3D DFT of a flattened spatio-temporal array.

    The input is taken to be flattened in (time, longitude, latitude) order,
    i.e. time varies slowest and latitude fastest. It is reshaped accordingly,
    re-ordered to (latitude, longitude, time), transformed, and flattened again.

    NOTE(review): the `df2` preview shown in this notebook has longitude
    varying fastest within a time step -- confirm this ordering assumption
    before applying the function to that data.

    Args:
        data (np.ndarray): 1-D array holding the flattened values.
        grid_shape (tuple): Grid dimensions as (latitude, longitude, time).

    Returns:
        np.ndarray: 1-D complex array of DFT coefficients, flattened from the
        (latitude, longitude, time) grid.

    Raises:
        ValueError: If ``data.size`` differs from the product of ``grid_shape``.
    """
    expected_size = np.prod(grid_shape)
    if data.size != expected_size:
        raise ValueError(f"Input data size is {data.size}, but expected {expected_size}.")

    # Storage layout is the conceptual shape reversed: (time, lon, lat).
    storage_shape = tuple(grid_shape[axis] for axis in (2, 1, 0))
    stored_cube = data.reshape(storage_shape)

    # Reverse the axes to obtain the (lat, lon, time) cube used for the transform.
    analysis_cube = stored_cube.transpose(2, 1, 0)

    return fft.fftn(analysis_cube).flatten()

def get_3d_fft_from_dataframe(df: pd.DataFrame, grid_shape: tuple = (66, 130, 8)) -> np.ndarray:
    """
    3D DFT of the 'ColumnAmountO3' column of a flattened spatio-temporal DataFrame.

    Args:
        df (pd.DataFrame): Table whose 'ColumnAmountO3' column holds the flattened grid.
        grid_shape (tuple): Grid dimensions as (latitude, longitude, time).

    Returns:
        np.ndarray: 1-D complex array of DFT coefficients, as produced by
        ``get_3d_fft_coefficients``.
    """
    return get_3d_fft_coefficients(df['ColumnAmountO3'].values, grid_shape)

def compute_power_spectrum(covariance_func, grid_shape: tuple = (66, 130, 8)) -> np.ndarray:
    """
    Computes the theoretical power spectrum from a stationary covariance function.

    Under the assumption of a stationary and circulant process (the same assumption
    made by the FFT), the power spectrum is the DFT of the first row of the
    circulant covariance matrix. On a periodic grid of length n, index i in that
    row corresponds to the signed lag ``i`` for ``i <= n // 2`` and ``i - n``
    otherwise, so the covariance is evaluated at those wrapped lags. Using the
    raw indices 0..n-1 instead would not give a circulant row, and the DFT
    would carry a non-negligible imaginary part.

    Args:
        covariance_func: A function that takes a displacement tuple (d_lat, d_lon, d_time)
                         and returns a single covariance value. Displacements are
                         signed integers (negative for wrapped lags).
        grid_shape (tuple): The (latitude, longitude, time) dimensions of the grid.

    Returns:
        np.ndarray: A 1D array containing the real part of the DFT of the
                    wrapped-lag covariance grid, flattened from
                    (latitude, longitude, time) order.
    """
    def _wrapped_lags(size):
        # Signed lag for each index of a periodic axis: 0, 1, ..., size//2, ..., -2, -1.
        return [index if index <= size // 2 else index - size for index in range(size)]

    n_lat, n_lon, n_time = grid_shape
    lat_lags = _wrapped_lags(n_lat)
    lon_lags = _wrapped_lags(n_lon)
    time_lags = _wrapped_lags(n_time)

    # First row of the circulant covariance matrix, laid out on the grid.
    autocov_grid = np.zeros(grid_shape)
    for i, d_lat in enumerate(lat_lags):
        for j, d_lon in enumerate(lon_lags):
            for k, d_time in enumerate(time_lags):
                autocov_grid[i, j, k] = covariance_func((d_lat, d_lon, d_time))

    # Wiener-Khinchin: the power spectrum is the DFT of the autocovariance.
    power_spectrum_grid = fft.fftn(autocov_grid)

    # For a covariance satisfying c(-h) == c(h) the spectrum is real up to
    # floating point error; .real discards that residue.
    return power_spectrum_grid.flatten().real


array([ 1.78672437e+07    -0.j        , -6.25720019e+03+32639.77112216j,
       -7.10920442e+03+21337.61938j   , ...,
        6.82686846e+01 +1973.56554526j,  1.03793847e+03  +893.84617913j,
       -1.67759824e+03 +8482.39478284j])

Next, build just the DFT matrix $D$.

In [14]:
# Display df2 to inspect its columns and the order in which rows are stored.
df2


Unnamed: 0,Latitude,Longitude,ColumnAmountO3,Hours_elapsed,Time,Source_Latitude,Source_Longitude
0,2.8878,131.6768,258.35470,477720.866667,2024-07-01 00:52:00,2.887841,131.699000
1,2.8878,131.6138,259.93120,477720.866667,2024-07-01 00:52:00,2.887996,131.636920
2,2.8878,131.5508,259.26990,477720.866667,2024-07-01 00:52:00,2.888159,131.573580
3,2.8878,131.4878,256.45093,477720.866667,2024-07-01 00:52:00,2.888175,131.511760
4,2.8878,131.4248,255.34843,477720.866667,2024-07-01 00:52:00,2.888187,131.449420
...,...,...,...,...,...,...,...
68635,0.0278,123.8018,260.99780,477727.800000,2024-07-01 07:48:00,0.026326,123.783195
68636,0.0278,123.7388,259.03363,477727.800000,2024-07-01 07:48:00,0.026288,123.721146
68637,0.0278,123.6758,260.59440,477727.800000,2024-07-01 07:48:00,0.026334,123.658806
68638,0.0278,123.6128,260.73022,477727.800000,2024-07-01 07:48:00,0.026321,123.595980


In [None]:
import numpy as np
import pandas as pd
from scipy import fft

def get_3d_fft_coefficients(data: np.ndarray, grid_shape: tuple) -> np.ndarray:
    """
    3D DFT of a flattened spatio-temporal array, returned in the input's own ordering.

    The input is taken to be flattened in (time, latitude, longitude) order:
    time varies slowest and longitude fastest. The transform is computed on the
    (latitude, longitude, time) cube and the coefficients are moved back to
    (time, latitude, longitude) before flattening, so input and output share
    the same layout.

    Args:
        data (np.ndarray): 1-D array holding the flattened values.
        grid_shape (tuple): Grid dimensions as (latitude, longitude, time).

    Returns:
        np.ndarray: 1-D complex array of DFT coefficients.

    Raises:
        ValueError: If ``data.size`` differs from the product of ``grid_shape``.
    """
    expected_size = np.prod(grid_shape)
    if data.size != expected_size:
        raise ValueError(f"Input data size is {data.size}, but expected {expected_size}.")

    # Storage layout: (time, lat, lon).
    storage_shape = (grid_shape[2], grid_shape[0], grid_shape[1])
    stored_cube = data.reshape(storage_shape)

    # Move time to the last axis -> (lat, lon, time), the layout used for the transform.
    analysis_cube = np.moveaxis(stored_cube, 0, -1)
    spectrum = fft.fftn(analysis_cube)

    # Move time back to the front so the flattened output matches the input ordering.
    return np.moveaxis(spectrum, -1, 0).flatten()

def construct_3d_dft_matrix(grid_shape: tuple) -> np.ndarray:
    """
    Constructs the explicit transformation matrix D for the 3D DFT.

    The matrix D transforms a flattened data vector Z into its DFT coefficients Y
    via the multiplication Y = DZ. Column i of D is the result of applying
    ``get_3d_fft_coefficients`` to the i-th standard basis vector.

    WARNING: This function is extremely memory-intensive. For a grid of size
    N = 66*130*8 = 68640, the (N, N) complex128 matrix D alone needs about
    75 GB of RAM. Use with caution.

    Args:
        grid_shape (tuple): The (latitude, longitude, time) dimensions of the grid.

    Returns:
        np.ndarray: The complex-valued (N, N) transformation matrix D.
    """
    n_total = int(np.prod(grid_shape))

    # Initialize the D matrix. It will be complex.
    d_matrix = np.zeros((n_total, n_total), dtype=np.complex128)

    print(f"Constructing ({n_total} x {n_total}) DFT matrix. This may take a long time...")

    # A single reusable basis vector replaces a full N x N identity matrix,
    # which would be allocated only to have its columns read one at a time.
    basis_vector = np.zeros(n_total)
    for i in range(n_total):
        basis_vector[i] = 1.0
        d_matrix[:, i] = get_3d_fft_coefficients(basis_vector, grid_shape)
        basis_vector[i] = 0.0  # reset for the next column
        if (i + 1) % 50 == 0:
            print(f"  ...processed column {i+1} of {n_total}")

    print("Matrix construction complete.")
    return d_matrix

# --- Example Usage ---
# --- Example Usage ---
# In a notebook __name__ is '__main__', so this demo runs when the cell is
# executed. It therefore works on its own `sample_df` instead of rebinding
# `df2`, which would replace the real data with random numbers.
if __name__ == '__main__':
    # Seeded generator so the demonstration is reproducible.
    rng = np.random.default_rng(0)

    # --- Part 1: Using the efficient function on a full-sized table ---

    full_grid_shape = (66, 130, 8)
    full_total_size = np.prod(full_grid_shape)

    # Stand-in for the real DataFrame: same length, random values.
    print(f"--- Demonstration with a full-sized sample DataFrame ({full_total_size} rows) ---")
    sample_df = pd.DataFrame({'ColumnAmountO3': rng.standard_normal(int(full_total_size))})

    # This is how the efficient function would be used on the actual data
    print("Calculating FFT coefficients from the full DataFrame using the efficient function...")
    y_from_df_function = get_3d_fft_coefficients(sample_df['ColumnAmountO3'].values, full_grid_shape)
    print(f"Successfully calculated {y_from_df_function.size} coefficients.")
    print("-" * 50)

    # --- Part 2: Demonstrating the D matrix construction on a small scale ---

    # WARNING: Using the full grid_shape for D will likely cause a MemoryError.
    # A tiny grid is enough to demonstrate the principle and verify correctness.
    small_grid_shape = (2, 3, 2)
    small_total_size = np.prod(small_grid_shape)

    print(f"\n--- Demonstrating D matrix construction on a small {small_grid_shape} grid ---")

    # 1. Construct the explicit D matrix for the small grid.
    D = construct_3d_dft_matrix(small_grid_shape)

    # 2. Random data vector Z of the matching size.
    z_vector_small = rng.standard_normal(int(small_total_size))

    # 3. Compute Y both ways: directly, and through the explicit matrix.
    y_from_function = get_3d_fft_coefficients(z_vector_small, small_grid_shape)
    y_from_matrix = D @ z_vector_small

    # 4. Check if the results are numerically very close.
    is_verified = np.allclose(y_from_function, y_from_matrix)

    print("\n--- Verification ---")
    print(f"Results are identical: {is_verified}")

    if is_verified:
        print("\nThe D matrix was constructed correctly for the small grid.")
    else:
        print("\nError: The D matrix construction failed verification.")