In [1]:
# work environment: jl2815
# Standard libraries
import sys
import logging
import argparse # Argument parsing
import math
from collections import defaultdict
import concurrent
from concurrent.futures import ThreadPoolExecutor  # Importing specific executor for clarity
import time

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Nearest neighbor search
import sklearn
from sklearn.neighbors import BallTree

# Special functions and optimizations
from scipy.special import gamma, kv  # Bessel function and gamma function
from scipy.stats import multivariate_normal  # Simulation
from scipy.optimize import minimize
from scipy.spatial.distance import cdist  # For space and time distance
from scipy.spatial import distance  # Find closest spatial point
from scipy.optimize import differential_evolution

# Plotting and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Type hints
from typing import Callable, Union, Tuple

# Add your custom path
sys.path.append("/cache/home/jl2815/tco")

# Custom imports
from GEMS_TCO import orbitmap 
from GEMS_TCO import kernels 
from GEMS_TCO import smoothspace

import pickle

# Summary

The problem was largely due to how the data was pre-processed.

I should have used center-matching process. It makes huge difference. Also it makes less singularity problem which 
also affects the performance of Vecchia likelihoods. 

# Test averaging processed data

In [9]:
df = pd.read_csv("C:\\Users\\joonw\\TCO\\data_engineering\\data_2024\\data_24_07_0131_N510_E110120.csv")

In [10]:
instance = orbitmap.MakeOrbitdata(df = df,lat_s=5,lat_e=10,lon_s=110,lon_e=120,lat_resolution=.4,lon_resolution=.4)

filepath ="C:\\Users\\joonw\\TCO\\data_engineering\\data_2024\\orbit_map24_07.pkl"
with open(filepath,'rb') as pickle_file:
    coarse_dict_24_1 = pickle.load(pickle_file)

sparse_map24_7 = instance.make_sparsemap(coarse_dict_24_1, .4 )


In [11]:
sample_df = sparse_map24_7['y24m07day01_hm01:00']
coarse_dicts = sparse_map24_7

mm_cond_number=10
key_for_dict = 2

key_idx = sorted(coarse_dicts)
if not key_idx:
    raise ValueError("coarse_dicts is empty")

# extract first hour data because all data shares the same spatial grid
data_for_coord = coarse_dicts[key_idx[0]]
x1 = data_for_coord['Longitude'].values
y1 = data_for_coord['Latitude'].values 
coords1 = np.stack((x1, y1), axis=-1)

instance = orbitmap.MakeOrbitdata()
s_dist = cdist(coords1, coords1, 'euclidean')
ord_mm, _ = instance.maxmin_naive(s_dist, 0)

data_for_coord = data_for_coord.iloc[ord_mm].reset_index(drop=True)
coords1_reordered = np.stack((data_for_coord['Longitude'].values, data_for_coord['Latitude'].values), axis=-1)
nns_map = instance.find_nns_naive(locs=coords1_reordered, dist_fun='euclidean', max_nn=mm_cond_number)



analysis_data_map = {}
for i in range(key_for_dict):
    tmp = coarse_dicts[key_idx[i]]
    tmp = tmp.iloc[ord_mm].reset_index(drop=True)  
    analysis_data_map[key_idx[i]] = tmp

aggregated_data = pd.DataFrame()
for i in range((key_for_dict)):
    tmp = coarse_dicts[key_idx[i]]
    tmp = tmp.iloc[ord_mm].reset_index(drop=True)  
    aggregated_data = pd.concat((aggregated_data, tmp), axis=0)

lat_n = sample_df['Latitude'].unique()
lon_n = sample_df['Longitude'].unique()

lat_number = len(lat_n)
lon_number = len(lon_n)

print(lat_number)
print(lon_number)

12
25


In [12]:
params = [0.5,0.5,0.5,0.5,0.5,0.5]

print(f'aggregated_data {aggregated_data.shape}')
#####################################################################

instance = kernels.matern_spatio_temporal(smooth = 0.5, input_map = analysis_data_map, nns_map = nns_map, mm_cond_number = mm_cond_number )
# data = data.iloc[ord,:]
out = instance.vecchia_likelihood(params)

start_time = time.time()
print(f'grid {lat_number}*{lon_number}:Full likelihood using {params} is {instance.full_likelihood(params, aggregated_data, aggregated_data["ColumnAmountO3"])}')
end_time = time.time()  # Record the end time
iteration_time = end_time - start_time  # Calculate the time spent
print(f"full likelihood {key_for_dict}time points took {iteration_time:.4f} seconds")

start_time = time.time()
print(f'grid {lat_number}*{lon_number}:Vecchia approximation likelihood using condition size {mm_cond_number}, {params} is {out}')
end_time = time.time()  # Record the end time
iteration_time = end_time - start_time  # Calculate the time spent
print(f"vecchia {key_for_dict}time points took {iteration_time:.4f} seconds")

aggregated_data (600, 5)
grid 12*25:Full likelihood using [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] is 3741.598955706946
full likelihood 2time points took 2.8065 seconds
grid 12*25:Vecchia approximation likelihood using condition size 10, [0.5, 0.5, 0.5, 0.5, 0.5, 0.5] is 3109.36234130164
vecchia 2time points took 0.0000 seconds


# Center matching data preprocessing

filepath = "C:\\Users\\joonw\\TCO\\data_engineering\\data_2024\\sparse_cen_map24_07.pkl"

In [2]:
# Load the one dictionary to set spaital coordinates
filepath = "C:\\Users\\joonw\\TCO\\data_engineering\\data_2024\\sparse_cen_map24_07.pkl"

with open(filepath, 'rb') as pickle_file:
    coarse_dict_24_1 = pickle.load(pickle_file)


sample_df = coarse_dict_24_1['y24m07day01_hm01:00']


lat_lon_resolution = [20,20]

# { (20,20):(5,1), (5,5):(20,40) }
rho_lat = lat_lon_resolution[0]          
rho_lon = lat_lon_resolution[1]
lat_n = sample_df['Latitude'].unique()[::rho_lat]
lon_n = sample_df['Longitude'].unique()[::rho_lon]

lat_number = len(lat_n)
print(lat_n)
print(lon_n)
lon_number = len(lon_n)

# Set spatial coordinates for each dataset
coarse_dicts = {}
for key in coarse_dict_24_1:
    tmp_df = coarse_dict_24_1[key]
    coarse_filter = (tmp_df['Latitude'].isin(lat_n)) & (tmp_df['Longitude'].isin(lon_n))
    coarse_dicts[f"{2024}_{7:02d}_{key}"] = tmp_df[coarse_filter].reset_index(drop=True)


print(lat_number)
print(lon_number)
key_idx = sorted(coarse_dicts)
if not key_idx:
    raise ValueError("coarse_dicts is empty")

# extract first hour data because all data shares the same spatial grid
data_for_coord = coarse_dicts[key_idx[0]]
x1 = data_for_coord['Longitude'].values
y1 = data_for_coord['Latitude'].values 
coords1 = np.stack((x1, y1), axis=-1)



[5.025 6.025 7.025 8.025 9.025]
[110.025 111.025 112.025 113.025 114.025 115.025 116.025 117.025 118.025
 119.025]
5
10


In [8]:
mm_cond_number = 10
key_for_dict = 2

instance = orbitmap.MakeOrbitdata()
s_dist = cdist(coords1, coords1, 'euclidean')
ord_mm, _ = instance.maxmin_naive(s_dist, 0)

data_for_coord = data_for_coord.iloc[ord_mm].reset_index(drop=True)
coords1_reordered = np.stack((data_for_coord['Longitude'].values, data_for_coord['Latitude'].values), axis=-1)
nns_map = instance.find_nns_naive(locs=coords1_reordered, dist_fun='euclidean', max_nn=mm_cond_number)



analysis_data_map = {}
for i in range(key_for_dict):
    tmp = coarse_dicts[key_idx[i]]
    tmp = tmp.iloc[ord_mm].reset_index(drop=True)  
    analysis_data_map[key_idx[i]] = tmp

aggregated_data = pd.DataFrame()
for i in range((key_for_dict)):
    tmp = coarse_dicts[key_idx[i]]
    tmp = tmp.iloc[ord_mm].reset_index(drop=True)  
    aggregated_data = pd.concat((aggregated_data, tmp), axis=0)


print(f'aggregated_data {aggregated_data.shape}')
print(aggregated_data.to_string())


#####################################################################

aggregated_data (100, 5)
    Latitude  Longitude  ColumnAmountO3  Hours_elapsed                 Time
0      5.025    110.025       273.39870       477721.0  2024-07-01 01:00:00
1      9.025    119.025       276.24142       477721.0  2024-07-01 01:00:00
2      5.025    116.025       261.51453       477721.0  2024-07-01 01:00:00
3      9.025    113.025       268.22525       477721.0  2024-07-01 01:00:00
4      6.025    113.025       272.66770       477721.0  2024-07-01 01:00:00
5      6.025    119.025       259.94556       477721.0  2024-07-01 01:00:00
6      8.025    110.025       269.52832       477721.0  2024-07-01 01:00:00
7      8.025    116.025       267.32480       477721.0  2024-07-01 01:00:00
8      5.025    112.025       271.26544       477721.0  2024-07-01 01:00:00
9      5.025    114.025       268.74588       477721.0  2024-07-01 01:00:00
10     5.025    118.025       265.10030       477721.0  2024-07-01 01:00:00
11     6.025    111.025       273.67868       477721.0  2024-07

In [7]:
params = [60,8.25,8.25,0.5,0.5,0.5]

instance = kernels.matern_spatio_temporal(smooth = 0.5, input_map = analysis_data_map, nns_map = nns_map, mm_cond_number = mm_cond_number )
# data = data.iloc[ord,:]
out = instance.vecchia_likelihood(params)

start_time = time.time()
print(f'grid {lat_number}*{lon_number}:Full likelihood using {params} is {instance.full_likelihood(params, aggregated_data, aggregated_data["ColumnAmountO3"])}')
end_time = time.time()  # Record the end time
iteration_time = end_time - start_time  # Calculate the time spent
print(f"full likelihood {key_for_dict}time points took {iteration_time:.4f} seconds")

start_time = time.time()
print(f'grid {lat_number}*{lon_number}:Vecchia approximation likelihood using condition size {mm_cond_number}, {params} is {out}')
end_time = time.time()  # Record the end time
iteration_time = end_time - start_time  # Calculate the time spent
print(f"vecchia {key_for_dict}time points took {iteration_time:.4f} seconds")

# 2137.099
# 2499.2030

grid 5*10:Full likelihood using [60, 8.25, 8.25, 0.5, 0.5, 0.5] is 274.19630949798125
full likelihood 2time points took 0.0768 seconds
grid 5*10:Vecchia approximation likelihood using condition size 10, [60, 8.25, 8.25, 0.5, 0.5, 0.5] is 291.7490506399612
vecchia 2time points took 0.0000 seconds
