In [22]:

# Special functions and optimizations
from typing import Callable, Union, Tuple
from scipy.spatial.distance import cdist  # For space and time distance
from scipy.special import gamma, kv  # Bessel function and gamma function
from scipy.optimize import minimize
from scipy.optimize import basinhopping, minimize
import pandas as pd
import numpy as np

# Synthetic sample: 100 points with longitude, latitude and time columns.
# The three columns are drawn in this order, so the random stream is consumed
# exactly as a dict literal with the same entries would consume it.
data = dict(
    longitude=np.random.uniform(low=-180, high=180, size=100),
    latitude=np.random.uniform(low=-90, high=90, size=100),
    time=np.random.uniform(low=90, high=180, size=100),
)
df = pd.DataFrame(data)
print(df)

# From here on `df` is a plain (100, 3) float array, not a DataFrame.
df = df.to_numpy()

     longitude   latitude        time
0   -41.042413  72.311482  128.066920
1  -118.658691 -33.886564  123.977527
2   -67.244260 -43.615308  107.831699
3   -22.693072  32.253074  150.683464
4   -54.507869  51.103011  135.731688
..         ...        ...         ...
95  -69.500039 -80.200911  164.189459
96   89.977123  41.736301  151.724857
97   97.012391  17.828390  171.285136
98  162.892887  53.241542  178.931654
99  117.575736  -2.605172  102.154213

[100 rows x 3 columns]


In [46]:
# Matern smoothness read as a module-level name by matern_cov_yx_test.
smooth = 0.5
# Parameter vector; the covariance functions in this cell unpack it as
# (sigmasq, range_lat, range_lon, advec, beta, nugget).
params = [20, 8, 8, 0.5, 0.5, 0.1]

# NOTE(review): params[1] is taken as range_lon here, while the covariance
# functions name the same slot range_lat. Both entries are 8 in this cell so
# the result is unaffected, but the order should be made consistent.
range_lon, range_lat = params[1], params[2]

# Diagonal scaling that custom_distance applies to the first two coordinates
# of a difference vector (1 / sqrt(range) per axis).
sqrt_range_mat = np.diag([1 / range_lon**0.5, 1 / range_lat**0.5])


# Custom distance function for cdist
def custom_distance(u, v):
    """Distance between two 3-component points for use as a ``cdist`` metric.

    The first two components of the difference are multiplied by the
    module-level ``sqrt_range_mat``; the third component is used as is.
    The result is the Euclidean norm of the scaled 2-D part combined with the
    absolute third-component difference.
    """
    delta = u - v
    scaled_xy = sqrt_range_mat @ delta[:2]
    spatial = np.linalg.norm(scaled_xy)   # length of the scaled 2-D part
    temporal = np.abs(delta[2])           # gap along the third component
    return np.sqrt(spatial**2 + temporal**2)


def matern_cov_yx_test(params: Tuple[float,float,float,float,float,float], y: np.ndarray, x: np.ndarray) -> np.ndarray:
    """Matern space-time covariance between the rows of ``x`` and the rows of ``y``.

    Each row is read as (column 0, column 1, column 2 = time, ...). Both spatial
    columns are shifted by ``advec * time`` and time is multiplied by ``beta``
    before pairwise distances are computed with ``custom_distance``.

    The spatial range scaling is whatever the module-level ``sqrt_range_mat``
    holds (it is applied inside ``custom_distance``) and the smoothness is the
    module-level ``smooth``; the two range entries of ``params`` are not read
    by this function.

    Args:
        params: (sigmasq, range_lat, range_lon, advec, beta, nugget).
        y: 2-D array with at least three columns.
        x: 2-D array with at least three columns.

    Returns:
        Array of shape (len(x), len(y)). Zero-distance pairs get ``sigmasq``.
        ``nugget`` is added to the diagonal only when the result is square.

    Raises:
        ValueError: if ``y`` or ``x`` is None.
    """
    sigmasq, _range_lat, _range_lon, advec, beta, nugget = params
    # Validate inputs
    if y is None or x is None:
        raise ValueError("Both y and x_df must be provided.")

    def _advected_coords(points: np.ndarray) -> np.ndarray:
        # (col0 - advec * t, col1 - advec * t, beta * t) for every row.
        t = points[:, 2]
        return np.column_stack((points[:, 0] - advec * t,
                                points[:, 1] - advec * t,
                                beta * t))

    distance = cdist(_advected_coords(x), _advected_coords(y), metric=custom_distance)

    # Start from sigmasq everywhere (the zero-distance value) in a fresh array
    # instead of overwriting the distance matrix in place.
    out = np.full(distance.shape, sigmasq, dtype=float)

    # The closed form evaluates 0 * inf at distance 0, so only apply it to
    # strictly non-zero distances.
    non_zero_indices = distance != 0
    d = distance[non_zero_indices]
    out[non_zero_indices] = (sigmasq * (2**(1-smooth)) / gamma(smooth) *
                             d**smooth *
                             kv(smooth, d))

    # Nugget on the diagonal only makes sense for a square matrix. The previous
    # `out += np.eye(out.shape[0]) * nugget` raised for an (n, m) result with
    # n > 1 and silently added the nugget to every entry of a (1, m) result.
    if out.shape[0] == out.shape[1]:
        out[np.diag_indices_from(out)] += nugget
    return out


# Matern covariance of the synthetic sample with itself.
b = matern_cov_yx_test(params, y=df, x=df)

def matern2(params: Tuple[float,float,float,float,float,float], y: np.ndarray, x: np.ndarray) -> np.ndarray:
    """Exponential space-time covariance between the rows of ``x`` and ``y``.

    Computes ``sigmasq * exp(-distance)`` on the same advected, time-scaled
    coordinates as ``matern_cov_yx_test``: each row is read as
    (column 0, column 1, column 2 = time, ...), both spatial columns are
    shifted by ``advec * time`` and time is multiplied by ``beta``.

    Distances come from ``custom_distance``, so the spatial range scaling is
    the module-level ``sqrt_range_mat``; the two range entries of ``params``
    are not read by this function.

    Args:
        params: (sigmasq, range_lat, range_lon, advec, beta, nugget).
        y: 2-D array with at least three columns.
        x: 2-D array with at least three columns.

    Returns:
        Array of shape (len(x), len(y)). ``nugget`` is added to the diagonal
        only when the result is square.

    Raises:
        ValueError: if ``y`` or ``x`` is None.
    """
    sigmasq, _range_lat, _range_lon, advec, beta, nugget = params
    # Validate inputs
    if y is None or x is None:
        raise ValueError("Both y and x_df must be provided.")

    def _advected_coords(points: np.ndarray) -> np.ndarray:
        # (col0 - advec * t, col1 - advec * t, beta * t) for every row.
        t = points[:, 2]
        return np.column_stack((points[:, 0] - advec * t,
                                points[:, 1] - advec * t,
                                beta * t))

    distance = cdist(_advected_coords(x), _advected_coords(y), metric=custom_distance)

    # exp(-0) is exactly 1, so zero-distance pairs already evaluate to sigmasq
    # and need no separate branch.
    out = sigmasq * np.exp(-distance)

    # Nugget on the diagonal only makes sense for a square matrix. The previous
    # `out += np.eye(out.shape[0]) * nugget` raised for an (n, m) result with
    # n > 1 and silently added the nugget to every entry of a (1, m) result.
    if out.shape[0] == out.shape[1]:
        out[np.diag_indices_from(out)] += nugget
    return out
# Exponential-kernel covariance of the same sample, for comparison with `b`.
a = matern2(params, y=df, x=df)

print(b)

[[2.01000000e+01 3.33031983e-19 1.95198265e-16 ... 9.36410262e-23
  9.35396181e-30 5.38999394e-28]
 [3.33031983e-19 2.01000000e+01 3.26513188e-09 ... 6.07038134e-31
  3.08349456e-41 3.25875428e-38]
 [1.95198265e-16 3.26513188e-09 2.01000000e+01 ... 2.02080667e-24
  2.38265321e-34 4.84064237e-29]
 ...
 [9.36410262e-23 6.07038134e-31 2.02080667e-24 ... 2.01000000e+01
  3.02474639e-10 8.53576657e-17]
 [9.35396181e-30 3.08349456e-41 2.38265321e-34 ... 3.02474639e-10
  2.01000000e+01 2.40667027e-16]
 [5.38999394e-28 3.25875428e-38 4.84064237e-29 ... 8.53576657e-17
  2.40667027e-16 2.01000000e+01]]


In [50]:
# Total signed difference between the two covariance matrices.
# NOTE(review): signed differences can cancel; np.abs(a - b).max() is a
# stricter agreement check.
(a - b).sum()

np.float64(-2.2237754044130276e-15)

In [None]:
import math

def matern_cov(d, v, sigmasq=2, range_=1):
    """Matern covariance at a scalar distance.

    Evaluates ``sigmasq * 2**(1 - v) / gamma(v) * r**v * K_v(r)`` with
    ``r = |d| / range_``, where ``K_v`` is the modified Bessel function of the
    second kind.

    Args:
        d: scalar distance (its absolute value is used).
        v: smoothness parameter.
        sigmasq: variance; defaults to the value previously hard-coded here.
        range_: range parameter; defaults to the value previously hard-coded here.

    Returns:
        The covariance. At distance 0 this is ``sigmasq``: the kernel tends to
        ``sigmasq`` as the distance goes to 0, whereas the closed form would
        evaluate 0 * inf there. (The earlier version returned 1 at distance 0
        even though the variance was 2.)
    """
    scaled = np.abs(d) / range_
    if scaled == 0:
        return sigmasq
    return sigmasq * (2**(1-v)) / math.gamma(v) * scaled**v * kv(v, scaled)
    
# Bessel-function form at distance 4 with smoothness 1.5 ...
print(matern_cov(4,1.5))

# ... against the closed form sigmasq * (1 + r) * exp(-r), with r = |d| / range_.
d, sigmasq, range_ = 4, 2, 1
abs_d = np.abs(d)
out = sigmasq * (1 + abs_d / range_) * np.exp(-abs_d / range_)
print(out)



0.18315638888734181
0.1831563888873418


Vecchia

In [1]:
# work environment: jl2815
# Standard libraries
import sys
import logging
import argparse # Argument parsing
import math
from collections import defaultdict
import concurrent
from concurrent.futures import ThreadPoolExecutor  # Importing specific executor for clarity
import time 

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Nearest neighbor search
import sklearn
from sklearn.neighbors import BallTree

# Special functions and optimizations
from scipy.special import gamma, kv  # Bessel function and gamma function
from scipy.stats import multivariate_normal  # Simulation
from scipy.optimize import minimize
from scipy.spatial.distance import cdist  # For space and time distance
from scipy.spatial import distance  # Find closest spatial point
from scipy.optimize import differential_evolution

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Type hints
from typing import Callable, Union, Tuple

# Add your custom path
sys.path.append("/cache/home/jl2815/tco")

# Custom imports
from GEMS_TCO import orbitmap 
from GEMS_TCO import kernels 
from GEMS_TCO import smoothspace

import pickle

In [17]:

lat_lon_resolution = [20,20]  # [latitude stride, longitude stride] used to thin the grid below
key_for_dict = 4              # number of sorted keys used as time steps further down
mm_cond_number=10             # passed as max_nn to the neighbour search further down

# Load one dictionary to set the spatial coordinates.
# NOTE(review): hard-coded absolute local path; pickle.load can execute
# arbitrary code, so only load files from a trusted source.
# NOTE(review): the variable is named *_24_1 but the file is the 2023-01 map.
filepath = "C:/Users/joonw/TCO/data_engineering/data_2023/sparse_cen_map23_01.pkl"
with open(filepath, 'rb') as pickle_file:
    coarse_dict_24_1 = pickle.load(pickle_file)

# Look the sample frame up once and fail with a clear message if it is
# missing. The check used to sit after a direct dict[...] lookup that had
# already raised, so it could never report anything.
sample_key = coarse_dict_24_1.get('y23m01day01_hm02:12')
if sample_key is None:
    raise KeyError("Key 'y23m01day01_hm02:12' not found in the dictionary.")
sample_df = sample_key

# { (20,20):(5,1), (5,5):(20,40) }
# Keep every rho_lat-th unique latitude and every rho_lon-th unique longitude.
rho_lat = lat_lon_resolution[0]
rho_lon = lat_lon_resolution[1]
lat_n = sample_df['Latitude'].unique()[::rho_lat]
lon_n = sample_df['Longitude'].unique()[::rho_lon]

lat_number = len(lat_n)
lon_number = len(lon_n)

# Thin every loaded frame to the coarse latitude/longitude grid, keyed as
# "<year>_<month>_<original key>".
coarse_dicts = {}

years = ['2024']
for year in years:
    for month in range(7, 8):  # month 7 only; widen the range to cover more months
        filepath = f"C:/Users/joonw/TCO/data_engineering/data_{year}/sparse_cen_map{year[2:]}_{month:02d}.pkl"
        with open(filepath, 'rb') as pickle_file:
            loaded_map = pickle.load(pickle_file)

        # The file handle is not needed once the map has been unpickled.
        for key in loaded_map:
            tmp_df = loaded_map[key]
            coarse_filter = tmp_df['Latitude'].isin(lat_n) & tmp_df['Longitude'].isin(lon_n)
            coarse_dicts[f"{year}_{month:02d}_{key}"] = tmp_df[coarse_filter].reset_index(drop=True)


# Sorted keys of the thinned frames; stop early if nothing was loaded.
key_idx = sorted(coarse_dicts)
if not key_idx:
    raise ValueError("coarse_dicts is empty")

# Use the first frame for coordinates.
# NOTE(review): this assumes every frame shares the same spatial grid - confirm.
data_for_coord = coarse_dicts[key_idx[0]]
x1 = data_for_coord['Longitude'].values
y1 = data_for_coord['Latitude'].values 
# One (longitude, latitude) row per point.
coords1 = np.stack((x1, y1), axis=-1)

# Project helper; the lat/lon bounds are passed as given and their effect is not visible here.
instance = orbitmap.MakeOrbitdata(df = data_for_coord, lat_s= 5, lat_e=10, lon_s=110, lon_e=120)
# Pairwise Euclidean distances in (longitude, latitude) units.
s_dist = cdist(coords1, coords1, 'euclidean')
# ord_mm is used below as a row ordering; presumably a max-min ordering - TODO confirm in orbitmap.
ord_mm, _ = instance.maxmin_naive(s_dist, 0)

# Reorder the frame by ord_mm, rebuild the coordinates in that order, then
# search neighbours on the reordered coordinates with at most mm_cond_number per point.
data_for_coord = data_for_coord.iloc[ord_mm].reset_index(drop=True)
coords1_reordered = np.stack((data_for_coord['Longitude'].values, data_for_coord['Latitude'].values), axis=-1)
nns_map = instance.find_nns_naive(locs=coords1_reordered, dist_fun='euclidean', max_nn=mm_cond_number)



# For the first key_for_dict keys, store the rows reordered by ord_mm and the
# first four columns as a NumPy array.
# NOTE(review): key_idx[i] raises IndexError if fewer than key_for_dict frames were loaded.
analysis_data_map = {}
for i in range(key_for_dict):
    tmp = coarse_dicts[key_idx[i]]
    # tmp = tmp.iloc[ord_mm].reset_index(drop=True)  
    tmp = tmp.iloc[ord_mm, :4].to_numpy()
    analysis_data_map[key_idx[i]] = tmp


# NOTE(review): the recorded run of this cell fails on the next line with
# "module 'GEMS_TCO.kernels' has no attribute 'matern_spatio_temporal'", so the
# assignments after it were not executed in that run. Check the name exported by kernels.
# This also rebinds `instance`, which held the orbitmap helper until now.
instance = kernels.matern_spatio_temporal(smooth =0.5, input_map = analysis_data_map, nns_map = nns_map, mm_cond_number = mm_cond_number )
# data = data.iloc[ord,:]

# Module-level names read by vecchia_likelihood_test.
# NOTE(review): that function also reads key_list and size_per_hour, which are not defined in this cell.
start_time = time.time()
number_of_timestamps = key_for_dict
input_map = analysis_data_map

AttributeError: module 'GEMS_TCO.kernels' has no attribute 'matern_spatio_temporal'

In [None]:

def vecchia_likelihood_test( params: Tuple[float,float,float,float,float,float]):
    """Vecchia-style negative log-likelihood summed over time steps and points.

    For each time step and each point, the point's value (column 2 of its row)
    is treated as Gaussian conditional on a conditioning set made of:
      * the rows listed in ``nns_map[index]`` at the same time step, and
      * from the second time step on, those rows plus the point itself at the
        previous time step.
    The mean is a linear function of columns 0-1 whose coefficients are
    estimated by generalised least squares on the conditioning set and reused
    for the point being predicted.

    Reads the module-level names ``number_of_timestamps``, ``input_map``,
    ``key_list``, ``size_per_hour``, ``nns_map`` and ``matern_cov_yx``.
    NOTE(review): ``key_list``, ``size_per_hour`` and ``matern_cov_yx`` are not
    defined in the cells visible here (the recorded run stops with
    "name 'key_list' is not defined"); they must exist before calling.

    Points whose conditioning set is empty are skipped, because neither the
    regression mean nor the conditional variance can be formed without
    conditioning data.
    NOTE(review): the code removed from the top of the loop suggested these head
    points were meant to go through a separate full likelihood; restore that
    term if it is needed. A conditioning set with a single row still makes the
    2x2 regression system rank-deficient.

    Args:
        params: parameter vector forwarded unchanged to ``matern_cov_yx``.

    Returns:
        float: the accumulated negative log-likelihood.
    """
    neg_log_lik = 0.0

    for time_idx in range(number_of_timestamps):
        current_np = input_map[key_list[time_idx]]
        # Previous time step, available from the second step on.
        last_hour_np = input_map[key_list[time_idx - 1]] if time_idx > 0 else None

        for index in range(size_per_hour):
            # Keep the row 2-D with shape (1, n_cols) for the covariance
            # function, and read the scalars with explicit 2-D indexing
            # (row[2] on a (1, n) array indexes axis 0 and raises IndexError).
            current_row = current_np[index].reshape(1, -1)
            current_y = current_row[0, 2]
            locs_yy = current_row[0, :2]

            # Build the conditioning set.
            past = list(nns_map[index])
            data_list = []
            if past:
                data_list.append(current_np[past])
            if last_hour_np is not None:
                data_list.append(last_hour_np[past + [index], :])

            if not data_list:
                # Nothing to condition on (see the docstring).
                continue
            conditioning_data = np.vstack(data_list)

            y_xx = conditioning_data[:, 2]
            locs_xx = conditioning_data[:, :2]

            # Same keyword names for all three calls; the cross-covariance call
            # previously used y_df=/x_df=.
            cov_xx = matern_cov_yx(params=params, y=conditioning_data, x=conditioning_data)
            cov_yy = matern_cov_yx(params=params, y=current_row, x=current_row)
            cov_yx = matern_cov_yx(params=params, y=current_row, x=conditioning_data)

            # GLS coefficients from the conditioning set, reused for the mean
            # at the predicted point.
            tmp_xx1 = np.dot(locs_xx.T, np.linalg.solve(cov_xx, locs_xx))
            tmp_xx2 = np.dot(locs_xx.T, np.linalg.solve(cov_xx, y_xx))
            beta_xx = np.linalg.solve(tmp_xx1, tmp_xx2)
            mu_xx = np.dot(locs_xx, beta_xx)
            mu_yy = np.dot(locs_yy, beta_xx)

            # Conditional variance and mean of y | x, reduced to scalars so the
            # running total stays a scalar.
            cov_ygivenx = (cov_yy - np.dot(cov_yx.T, np.linalg.solve(cov_xx, cov_yx))).item()
            cond_mean = (mu_yy + np.dot(cov_yx.T, np.linalg.solve(cov_xx, y_xx - mu_xx))).item()

            alpha = current_y - cond_mean
            quad_form = alpha**2 / cov_ygivenx
            log_det = np.log(cov_ygivenx)

            # Univariate Gaussian negative log-density.
            neg_log_lik += 0.5 * (np.log(2 * np.pi) + log_det + quad_form)

    return float(neg_log_lik)

# Trial parameter vector for one evaluation of the Vecchia likelihood.
params = [20, 5, 5, 0.5, 0.5, 0.5]
vecchia_likelihood_test(params)

NameError: name 'key_list' is not defined

In [20]:
# Take the first two columns of the conditioning set, falling back to an
# empty (0, 2) array when there is nothing to condition on.
has_no_rows = conditioning_data.size == 0
if has_no_rows:
    print("conditioning_data is empty")
locs_xx = np.empty((0, 2)) if has_no_rows else conditioning_data[:, :2]

print(locs_xx)


conditioning_data is empty
[]


In [27]:
def ab(n,m,k):
    """Return ``n**3 / (n * m**3) / k`` and print the two terms being compared.

    Prints ``n**3`` and then ``n * m**3 / k`` before returning the ratio.
    NOTE(review): presumably a cost comparison between an n**3 computation and
    n blocks of size m split across k workers - confirm the intent.
    """
    cubed = n**3
    ratio = cubed / (n * m**3)
    print(cubed)
    print(n * m**3 / k)
    return ratio / k

# Ratio for n = 800, m = 20, k = 5.
ab(n=800, m=20, k=5)

512000000
1280000.0


16.0