In [1]:
%load_ext autoreload

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') # NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL

from JPAS_DA import global_setup
from JPAS_DA.data import loading_tools
from JPAS_DA.data import cleaning_tools
from JPAS_DA.data import crossmatch_tools
from JPAS_DA.data import process_dset_splits

import numpy as np

from JPAS_DA.utils import plotting_utils
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('default')
plt.close('all')
font, rcnew = plotting_utils.matplotlib_default_config()
mpl.rc('font', **font)
plt.rcParams.update(rcnew)
plt.style.use('tableau-colorblind10')
%matplotlib inline

In [2]:
# Data location and the seed used while loading, both read from the
# project-wide configuration module.
root_path = global_setup.DATA_path
random_seed_load = global_setup.default_seed

# Per-dataset selections; each one is passed to the loader as the "datasets"
# option of the dataset with the matching name.
load_JPAS_x_DESI_Raul, load_DESI_mocks_Raul, load_Ignasi = (
    global_setup.load_JPAS_x_DESI_Raul,
    global_setup.load_DESI_mocks_Raul,
    global_setup.load_Ignasi,
)

In [3]:
# Dataset name -> selection of sub-datasets to load for it.
dataset_selection = {
    "JPAS_x_DESI_Raul": load_JPAS_x_DESI_Raul,
    "DESI_mocks_Raul": load_DESI_mocks_Raul,
    "Ignasi": load_Ignasi,
}

# Load all three datasets in one call; every dataset receives its own
# {"datasets": ...} options dict through a keyword named after the dataset.
DATA = loading_tools.load_data_bundle(
    root_path=root_path,
    include=list(dataset_selection),
    random_seed=random_seed_load,
    **{name: {"datasets": selection} for name, selection in dataset_selection.items()},
)

2025-09-17 14:22:43,107 - INFO - 📥 Starting modular dataset loading (load_data_bundle)
2025-09-17 14:22:43,108 - INFO - ├── Loading JPAS_x_DESI_Raul ...
2025-09-17 14:22:43,108 - INFO - ├─── 📥 Starting JPAS_x_DESI_Raul dataset loading...
2025-09-17 14:22:43,108 - INFO - |    ├─── 🔹 Dataset: all (sample 100%)
2025-09-17 14:22:43,155 - INFO - |    |    ✔ CSV loaded: JPAS_DATA_PROPERTIES.csv (shape: (52020, 18))
2025-09-17 14:22:43,167 - INFO - |    |    ✔ NPY loaded: JPAS_DATA_Aper_Cor_3_FLUX+NOISE.npy (obs shape: (52020, 57))
2025-09-17 14:22:43,168 - INFO - ├─── ✅ Finished loading all JPAS datasets.
2025-09-17 14:22:43,169 - INFO - │   ✔ Loaded JPAS_x_DESI_Raul
2025-09-17 14:22:43,169 - INFO - ├── Loading DESI_mocks_Raul ...
2025-09-17 14:22:43,169 - INFO - ├─── 📥 Loading DESI datasets (splitted)...
2025-09-17 14:22:43,169 - INFO - ├─── 📥 Starting DESI dataset loading...
2025-09-17 14:22:43,169 - INFO - |    ├─── 🔹 Dataset: train
2025-09-17 14:22:46,254 - INFO - |    |    ✔ CSV loaded 

In [4]:
# Cleaning configuration taken from the project-wide setup module; it is passed
# as `config` to cleaning_tools.clean_data_pipeline in the next cell.
config_dict_cleaning = global_setup.config_dict_cleaning

In [5]:
# Run the cleaning pipeline on the loaded DATA bundle with the shared config.
# NOTE(review): because in_place=True is used and the result is bound back to
# DATA, re-running this cell presumably applies the cleaning steps again to the
# already-cleaned data (e.g. the column masking at indices [0, -2] reported in
# the log) — confirm, and re-run the loading cell first if this cell must be
# executed a second time.
DATA = cleaning_tools.clean_data_pipeline(DATA, config=config_dict_cleaning, in_place=True)

2025-09-17 14:23:05,608 - INFO - 🧹 Cleaning dataset: JPAS_x_DESI_Raul
2025-09-17 14:23:05,609 - INFO - ├── mask_out_unreliable_columns(mask_unreliable_filters_indices=[0, -2])
2025-09-17 14:23:05,617 - INFO - │   ├── Removed columns: [0, 55]
2025-09-17 14:23:05,617 - INFO - │   ├── New #filters: 55
2025-09-17 14:23:05,618 - INFO - │   ├── Updated observations shape: (52020, 55)
2025-09-17 14:23:05,618 - INFO - │   ├── Updated errors shape: (52020, 55)
2025-09-17 14:23:05,618 - INFO - ├── remove_NaNs(check='both', keep_rows_partially_filled_with_NaNs=True)
2025-09-17 14:23:05,619 - INFO - │   ├── rows fully NaN (drop): 0/52020 (0.00%)
2025-09-17 14:23:05,620 - INFO - │   ├── rows with SOME NaNs:   0/52020 (0.00%)
2025-09-17 14:23:05,620 - INFO - │   ├── rows dropped due to policy: 0/52020
2025-09-17 14:23:05,620 - INFO - │   └── final kept: 52020/52020 (100.00%)
2025-09-17 14:23:05,706 - INFO - ├── remove_magic_rows(check='obs', keep_rows_partially_filled_with_magic=True, magic_numbers=

In [6]:
# Containers for the crossmatch results, keyed first by match type and then by
# dataset name.
Dict_LoA = {"intersection": {}, "outersection": {}}

# Crossmatch the TARGETID columns of the two datasets (DESI mocks first, JPAS
# second). The seven return values are unpacked in the order the function
# yields them: three ID outputs, then the two "outersection" entries, then the
# two "intersection" entries.
(
    IDs1,
    IDs2,
    IDs12,
    Dict_LoA["outersection"]["DESI_mocks_Raul"],
    Dict_LoA["outersection"]["JPAS_x_DESI_Raul"],
    Dict_LoA["intersection"]["DESI_mocks_Raul"],
    Dict_LoA["intersection"]["JPAS_x_DESI_Raul"],
) = crossmatch_tools.crossmatch_IDs_two_datasets(
    DATA["DESI_mocks_Raul"]['all_pd']['TARGETID'],
    DATA["JPAS_x_DESI_Raul"]['all_pd']['TARGETID'],
)

2025-09-17 14:23:49,387 - INFO - 🔍 crossmatch_IDs_two_datasets()...
2025-09-17 14:23:49,388 - INFO - ├── 🚀 Starting ID categorization process...
2025-09-17 14:23:49,539 - INFO - |    ├── 📌 Found 1051168 unique IDs across 2 arrays.
2025-09-17 14:23:50,157 - INFO - |    ├── Presence matrix created with shape: (2, 1051168)
2025-09-17 14:23:50,160 - INFO - |    ├── Category mask created with shape: (2, 1051168)
2025-09-17 14:23:50,160 - INFO - ├── 🚀 Starting index retrieval process...
2025-09-17 14:23:50,161 - INFO - |    ├── 📌 Processing 1051168 unique IDs across 2 arrays.
2025-09-17 14:23:50,686 - INFO - ├── 🚀 Starting post-processing of unique IDs across two arrays...
2025-09-17 14:23:50,703 - INFO - |    ├── Processing complete: 1014323 IDs only in Array 1 (96.49%).
2025-09-17 14:23:50,703 - INFO - |    ├── Processing complete: 8 IDs only in Array 2 (0.0%).
2025-09-17 14:23:50,703 - INFO - |    ├── Processing complete: 36837 IDs in both arrays (3.5%).
2025-09-17 14:23:50,703 - INFO - ✅

In [7]:
# Train/validation/test ratios and split seeds, read from the project-wide setup
# module; the next cell looks up separate entries for the "intersection" and
# "outersection" subsets.
dict_split_data_options = global_setup.dict_split_data_options

In [8]:
# Partition each crossmatched List of Arrays into train / validation / test.
Dict_LoA_split = {"intersection": {}, "outersection": {}}

# (match type, dataset) pairs to split, in execution order. Both "intersection"
# entries use the same ratios and the same seed entry from the options dict; the
# next cell asserts that the resulting splits of the two datasets line up.
split_targets = (
    ("intersection", "JPAS_x_DESI_Raul"),
    ("intersection", "DESI_mocks_Raul"),
    ("outersection", "DESI_mocks_Raul"),
)

for xmatch_kind, dset_name in split_targets:
    # Option keys are suffixed with the match type, e.g. "train_ratio_intersection".
    Dict_LoA_split[xmatch_kind][dset_name] = process_dset_splits.split_LoA(
        Dict_LoA[xmatch_kind][dset_name],
        train_ratio=dict_split_data_options[f"train_ratio_{xmatch_kind}"],
        val_ratio=dict_split_data_options[f"val_ratio_{xmatch_kind}"],
        test_ratio=dict_split_data_options[f"test_ratio_{xmatch_kind}"],
        seed=dict_split_data_options[f"random_seed_split_{xmatch_kind}"],
    )

2025-09-17 14:23:50,737 - INFO - ├── ✂️ Splitting list of arrays (LoA) into train/val/test subsets...
2025-09-17 14:23:50,742 - INFO - ├── Finished splitting.
2025-09-17 14:23:50,742 - INFO - ├── ✂️ Splitting list of arrays (LoA) into train/val/test subsets...
2025-09-17 14:23:50,747 - INFO - ├── Finished splitting.
2025-09-17 14:23:50,747 - INFO - ├── ✂️ Splitting list of arrays (LoA) into train/val/test subsets...
2025-09-17 14:23:50,907 - INFO - ├── Finished splitting.


In [9]:
# Sanity check: within every split, the j-th JPAS group and the j-th DESI group
# must refer to the same TARGETID.
jpas_splits = Dict_LoA_split["intersection"]["JPAS_x_DESI_Raul"]
desi_splits = Dict_LoA_split["intersection"]["DESI_mocks_Raul"]
# NOTE(review): if 'all_pd' is a pandas DataFrame, the [...] lookups below are
# label-based — confirm the stored indices match the index labels.
jpas_targetids = DATA["JPAS_x_DESI_Raul"]['all_pd']["TARGETID"]
desi_targetids = DATA["DESI_mocks_Raul"]['all_pd']["TARGETID"]

for split_name in jpas_splits.keys():
    jpas_groups = jpas_splits[split_name]
    desi_groups = desi_splits[split_name]
    assert len(jpas_groups) == len(desi_groups), "Both datasets must have the same number unique TARGETIDs in each of training, validation, and testing sets."

    for jpas_group, desi_group in zip(jpas_groups, desi_groups):
        # The first JPAS entry of the group provides the reference TARGETID.
        jpas_id = jpas_targetids[jpas_group[0]]
        for desi_row in desi_group:
            desi_id = desi_targetids[desi_row]
            assert jpas_id == desi_id, "Both datasets must have the same TARGETIDs in each of training, validation, and testing sets."
        # Report only the groups with more than 8 DESI rows.
        if len(desi_group) > 8:
            print("TARGETID JPAS_x_DESI_Raul:", jpas_id, "TARGETID DESI_mocks_Raul:", desi_id)

TARGETID JPAS_x_DESI_Raul: 39633286363875916 TARGETID DESI_mocks_Raul: 39633286363875916
TARGETID JPAS_x_DESI_Raul: 39633278969317070 TARGETID DESI_mocks_Raul: 39633278969317070
TARGETID JPAS_x_DESI_Raul: 39633290071639703 TARGETID DESI_mocks_Raul: 39633290071639703
TARGETID JPAS_x_DESI_Raul: 39633290046472425 TARGETID DESI_mocks_Raul: 39633290046472425
TARGETID JPAS_x_DESI_Raul: 39633282656108720 TARGETID DESI_mocks_Raul: 39633282656108720
TARGETID JPAS_x_DESI_Raul: 39633297369727416 TARGETID DESI_mocks_Raul: 39633297369727416
TARGETID JPAS_x_DESI_Raul: 39633282681277567 TARGETID DESI_mocks_Raul: 39633282681277567
TARGETID JPAS_x_DESI_Raul: 39633297348757822 TARGETID DESI_mocks_Raul: 39633297348757822
TARGETID JPAS_x_DESI_Raul: 39633297344561337 TARGETID DESI_mocks_Raul: 39633297344561337
TARGETID JPAS_x_DESI_Raul: 39633290050669510 TARGETID DESI_mocks_Raul: 39633290050669510
TARGETID JPAS_x_DESI_Raul: 39633278948344864 TARGETID DESI_mocks_Raul: 39633278948344864
TARGETID JPAS_x_DESI_

In [10]:
# Key selections read from the project-wide setup module; they are passed as
# `keys_xx` / `keys_yy` to process_dset_splits.extract_from_block_by_LoA below.
keys_xx = global_setup.keys_load_features['keys_xx']
keys_yy = global_setup.keys_load_features['keys_yy']

In [11]:
# Extract the training split of the crossmatched ("intersection") subset from
# both datasets, DESI mocks first and JPAS second.
key_dset_split = "train"
key_xmatch = "intersection"

extracted = {}
for key_dset in ("DESI_mocks_Raul", "JPAS_x_DESI_Raul"):
    extracted[key_dset] = process_dset_splits.extract_from_block_by_LoA(
        block=DATA[key_dset],
        LoA=Dict_LoA_split[key_xmatch][key_dset][key_dset_split],
        keys_xx=keys_xx,
        keys_yy=keys_yy,
    )

# Suffix _1 holds the DESI mocks results, suffix _2 the JPAS results.
LoA_local_1, xx_1, yy_1 = extracted["DESI_mocks_Raul"]
LoA_local_2, xx_2, yy_2 = extracted["JPAS_x_DESI_Raul"]

2025-09-17 14:23:50,969 - INFO - |    ├── 🔧 extract_from_block_by_LoA()
2025-09-17 14:23:50,969 - INFO - |    ├── Using reference key: 'all_observations_normalized'
2025-09-17 14:23:51,311 - INFO - |    ├── Finished extract_from_block_by_LoA()
2025-09-17 14:23:51,312 - INFO - |    ├── 🔧 extract_from_block_by_LoA()
2025-09-17 14:23:51,312 - INFO - |    ├── Using reference key: 'all_observations_normalized'
2025-09-17 14:23:51,358 - INFO - |    ├── Finished extract_from_block_by_LoA()


In [12]:
# Both extractions must contain the same number of groups and the same number
# of distinct TARGETIDs.
assert len(LoA_local_1) == len(LoA_local_2), "Both datasets must have the same number unique TARGETIDs"
assert np.unique(yy_1['TARGETID']).shape == np.unique(yy_2['TARGETID']).shape, "Both datasets must have the same number unique TARGETIDs"

# Preview at most the first 50 groups; bounding by the actual group count avoids
# an IndexError when the split holds fewer than 50 groups.
n_groups_to_preview = min(50, len(LoA_local_1))
for ii in range(n_groups_to_preview):
    # Only print groups with more than 3 DESI rows.
    if len(LoA_local_1[ii]) > 3:
        for jpas_row in LoA_local_2[ii]:
            print("JPAS TARGETID", yy_2['TARGETID'][jpas_row])
        for desi_row in LoA_local_1[ii]:
            print("DESI TARGETID", yy_1['TARGETID'][desi_row])
        print()

JPAS TARGETID 39633275215416274
DESI TARGETID 39633275215416274
DESI TARGETID 39633275215416274
DESI TARGETID 39633275215416274
DESI TARGETID 39633275215416274
DESI TARGETID 39633275215416274
DESI TARGETID 39633275215416274

JPAS TARGETID 39633293687130371
DESI TARGETID 39633293687130371
DESI TARGETID 39633293687130371
DESI TARGETID 39633293687130371
DESI TARGETID 39633293687130371
DESI TARGETID 39633293687130371
DESI TARGETID 39633293687130371

JPAS TARGETID 39633278927374212
DESI TARGETID 39633278927374212
DESI TARGETID 39633278927374212
DESI TARGETID 39633278927374212
DESI TARGETID 39633278927374212
DESI TARGETID 39633278927374212
DESI TARGETID 39633278927374212

JPAS TARGETID 39633278935762951
DESI TARGETID 39633278935762951
DESI TARGETID 39633278935762951
DESI TARGETID 39633278935762951
DESI TARGETID 39633278935762951
DESI TARGETID 39633278935762951
DESI TARGETID 39633278935762951

