In [1]:
%load_ext autoreload

import logging
# Root-logger setup: timestamped messages at INFO level and above.
# Available levels: NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

from JPAS_DA import global_setup
from JPAS_DA.data import loading_tools
from JPAS_DA.data import cleaning_tools
from JPAS_DA.data import crossmatch_tools
from JPAS_DA.data import process_dset_splits
from JPAS_DA.data import data_loaders

import numpy as np

from JPAS_DA.utils import plotting_utils
import matplotlib as mpl
import matplotlib.pyplot as plt
# Start from matplotlib's stock settings and discard any figures left open
# by a previous run of this cell.
plt.style.use('default')
plt.close('all')
# Project-wide look: the helper's two return values are applied as the font
# settings and as a batch of rcParams overrides.
font, rcnew = plotting_utils.matplotlib_default_config()
mpl.rc('font', **font)
plt.rcParams.update(rcnew)
# Applied last, so this style sheet wins over any overlapping keys set above.
plt.style.use('tableau-colorblind10')
# Interactive figures inside the notebook (ipympl backend).
%matplotlib widget

In [2]:
# Data location and loading seed, taken from the project-wide setup module.
root_path = global_setup.DATA_path
random_seed_load = global_setup.default_seed

# Per-dataset loading specifications; each one is forwarded below as the
# "datasets" entry of the matching load_data_bundle keyword.
load_JPAS_x_DESI_Raul = global_setup.load_JPAS_x_DESI_Raul
load_DESI_mocks_Raul = global_setup.load_DESI_mocks_Raul
load_Ignasi = global_setup.load_Ignasi

In [3]:
# Datasets requested from the loader (passed as its `include` argument).
list_of_datasets_to_load = [
    "JPAS_x_DESI_Raul",
    "DESI_mocks_Raul",
]

In [4]:
# Load the requested datasets into a single bundle keyed by dataset name.
# NOTE(review): options for "Ignasi" are passed although it is not listed in
# `list_of_datasets_to_load` — presumably ignored by the loader; confirm.
DATA = loading_tools.load_data_bundle(
    root_path=root_path,
    random_seed=random_seed_load,
    include=list_of_datasets_to_load,
    JPAS_x_DESI_Raul={"datasets": load_JPAS_x_DESI_Raul},
    DESI_mocks_Raul={"datasets": load_DESI_mocks_Raul},
    Ignasi={"datasets": load_Ignasi},
)

2025-09-17 14:24:03,948 - INFO - 📥 Starting modular dataset loading (load_data_bundle)
2025-09-17 14:24:03,948 - INFO - ├── Loading JPAS_x_DESI_Raul ...
2025-09-17 14:24:03,949 - INFO - ├─── 📥 Starting JPAS_x_DESI_Raul dataset loading...
2025-09-17 14:24:03,949 - INFO - |    ├─── 🔹 Dataset: all (sample 100%)
2025-09-17 14:24:03,996 - INFO - |    |    ✔ CSV loaded: JPAS_DATA_PROPERTIES.csv (shape: (52020, 18))
2025-09-17 14:24:04,008 - INFO - |    |    ✔ NPY loaded: JPAS_DATA_Aper_Cor_3_FLUX+NOISE.npy (obs shape: (52020, 57))
2025-09-17 14:24:04,009 - INFO - ├─── ✅ Finished loading all JPAS datasets.
2025-09-17 14:24:04,010 - INFO - │   ✔ Loaded JPAS_x_DESI_Raul
2025-09-17 14:24:04,010 - INFO - ├── Loading DESI_mocks_Raul ...
2025-09-17 14:24:04,011 - INFO - ├─── 📥 Loading DESI datasets (splitted)...
2025-09-17 14:24:04,011 - INFO - ├─── 📥 Starting DESI dataset loading...
2025-09-17 14:24:04,011 - INFO - |    ├─── 🔹 Dataset: train
2025-09-17 14:24:07,090 - INFO - |    |    ✔ CSV loaded 

In [5]:
# Cleaning options come from the shared project configuration; they are
# handed to clean_data_pipeline as its `config` argument.
config_dict_cleaning = global_setup.config_dict_cleaning

In [6]:
# Run the configured cleaning steps on every loaded dataset.
# NOTE(review): with in_place=True and DATA rebound to the result, re-running
# this cell presumably cleans already-cleaned data (e.g. the column mask
# reported in the log would be applied a second time) — re-run the loading
# cell first; confirm.
DATA = cleaning_tools.clean_data_pipeline(
    DATA,
    config=config_dict_cleaning,
    in_place=True,
)

2025-09-17 14:24:11,894 - INFO - 🧹 Cleaning dataset: JPAS_x_DESI_Raul
2025-09-17 14:24:11,894 - INFO - ├── mask_out_unreliable_columns(mask_unreliable_filters_indices=[0, -2])
2025-09-17 14:24:11,903 - INFO - │   ├── Removed columns: [0, 55]
2025-09-17 14:24:11,903 - INFO - │   ├── New #filters: 55
2025-09-17 14:24:11,904 - INFO - │   ├── Updated observations shape: (52020, 55)
2025-09-17 14:24:11,904 - INFO - │   ├── Updated errors shape: (52020, 55)
2025-09-17 14:24:11,904 - INFO - ├── remove_NaNs(check='both', keep_rows_partially_filled_with_NaNs=True)
2025-09-17 14:24:11,905 - INFO - │   ├── rows fully NaN (drop): 0/52020 (0.00%)
2025-09-17 14:24:11,906 - INFO - │   ├── rows with SOME NaNs:   0/52020 (0.00%)
2025-09-17 14:24:11,906 - INFO - │   ├── rows dropped due to policy: 0/52020
2025-09-17 14:24:11,906 - INFO - │   └── final kept: 52020/52020 (100.00%)
2025-09-17 14:24:11,992 - INFO - ├── remove_magic_rows(check='obs', keep_rows_partially_filled_with_magic=True, magic_numbers=

In [7]:
# Cross-match the two catalogues on TARGETID. The DESI mocks are passed first
# and the JPAS x DESI sample second, so they are "Array 1" and "Array 2" in
# the log output.
Dict_LoA = {"intersection": {}, "outersection": {}}

# The call returns seven values: three ID collections followed by four
# per-dataset results, stored under "outersection" / "intersection".
# NOTE(review): presumably IDs1 / IDs2 / IDs12 are the IDs found only in the
# first array, only in the second, and in both — confirm against the helper.
(
    IDs1,
    IDs2,
    IDs12,
    Dict_LoA["outersection"]["DESI_mocks_Raul"],
    Dict_LoA["outersection"]["JPAS_x_DESI_Raul"],
    Dict_LoA["intersection"]["DESI_mocks_Raul"],
    Dict_LoA["intersection"]["JPAS_x_DESI_Raul"],
) = crossmatch_tools.crossmatch_IDs_two_datasets(
    DATA["DESI_mocks_Raul"]['all_pd']['TARGETID'],
    DATA["JPAS_x_DESI_Raul"]['all_pd']['TARGETID'],
)

2025-09-17 14:24:37,813 - INFO - 🔍 crossmatch_IDs_two_datasets()...
2025-09-17 14:24:37,814 - INFO - ├── 🚀 Starting ID categorization process...
2025-09-17 14:24:37,970 - INFO - |    ├── 📌 Found 1051168 unique IDs across 2 arrays.
2025-09-17 14:24:38,578 - INFO - |    ├── Presence matrix created with shape: (2, 1051168)
2025-09-17 14:24:38,582 - INFO - |    ├── Category mask created with shape: (2, 1051168)
2025-09-17 14:24:38,582 - INFO - ├── 🚀 Starting index retrieval process...
2025-09-17 14:24:38,582 - INFO - |    ├── 📌 Processing 1051168 unique IDs across 2 arrays.
2025-09-17 14:24:39,142 - INFO - ├── 🚀 Starting post-processing of unique IDs across two arrays...
2025-09-17 14:24:39,161 - INFO - |    ├── Processing complete: 1014323 IDs only in Array 1 (96.49%).
2025-09-17 14:24:39,161 - INFO - |    ├── Processing complete: 8 IDs only in Array 2 (0.0%).
2025-09-17 14:24:39,161 - INFO - |    ├── Processing complete: 36837 IDs in both arrays (3.5%).
2025-09-17 14:24:39,162 - INFO - ✅

In [8]:
# Train/val/test ratios and split seeds (separate entries for the
# intersection and the outersection) come from the shared project configuration.
dict_split_data_options = global_setup.dict_split_data_options

In [9]:
# Partition the cross-matched lists of arrays (LoA) into train / validation /
# test subsets. Two combinations are split, each with its own ratios and seed:
# the JPAS x DESI entries of the intersection and the DESI-mock entries of the
# outersection.
Dict_LoA_split = {
    "intersection": {
        "JPAS_x_DESI_Raul": process_dset_splits.split_LoA(
            Dict_LoA["intersection"]["JPAS_x_DESI_Raul"],
            train_ratio=dict_split_data_options["train_ratio_intersection"],
            val_ratio=dict_split_data_options["val_ratio_intersection"],
            test_ratio=dict_split_data_options["test_ratio_intersection"],
            seed=dict_split_data_options["random_seed_split_intersection"],
        ),
    },
    "outersection": {
        "DESI_mocks_Raul": process_dset_splits.split_LoA(
            Dict_LoA["outersection"]["DESI_mocks_Raul"],
            train_ratio=dict_split_data_options["train_ratio_outersection"],
            val_ratio=dict_split_data_options["val_ratio_outersection"],
            test_ratio=dict_split_data_options["test_ratio_outersection"],
            seed=dict_split_data_options["random_seed_split_outersection"],
        ),
    },
}

2025-09-17 14:24:39,201 - INFO - ├── ✂️ Splitting list of arrays (LoA) into train/val/test subsets...
2025-09-17 14:24:39,206 - INFO - ├── Finished splitting.
2025-09-17 14:24:39,206 - INFO - ├── ✂️ Splitting list of arrays (LoA) into train/val/test subsets...
2025-09-17 14:24:39,370 - INFO - ├── Finished splitting.


In [10]:
# Key selections from the shared project configuration; they are passed to
# extract_from_block_by_LoA as `keys_xx` and `keys_yy`.
# NOTE(review): presumably the input-feature and target entries of each data block — confirm.
keys_xx = global_setup.keys_load_features['keys_xx']
keys_yy = global_setup.keys_load_features['keys_yy']

In [18]:
# (dataset, cross-match category) pairs for which loaders are built; these are
# the same two combinations that were split into train/val/test subsets.
extract_dsets = [
    ("DESI_mocks_Raul", "outersection"),
    ("JPAS_x_DESI_Raul", "intersection")
]

# Nested mapping: dset_loaders[dataset][split] -> DataLoader.
dset_loaders = {}
for key_dset, key_xmatch in extract_dsets:
    dset_loaders.setdefault(key_dset, {})
    for split in global_setup.splits:
        # NOTE(review): a split name missing from the split result falls back to
        # an empty list instead of raising — confirm an empty LoA is handled as intended.
        LoA = Dict_LoA_split[key_xmatch][key_dset].get(split, [])
        # Pull the keys_xx / keys_yy entries of this dataset's block for the
        # selection in LoA; the first returned value is discarded.
        _, xx, yy = process_dset_splits.extract_from_block_by_LoA(
            block=DATA[key_dset], LoA=LoA, keys_xx=keys_xx, keys_yy=keys_yy
        )
        dset_loaders[key_dset][split] = data_loaders.DataLoader(xx, yy)

2025-09-17 14:55:58,980 - INFO - |    ├── 🔧 extract_from_block_by_LoA()
2025-09-17 14:55:58,981 - INFO - |    ├── Using reference key: 'all_observations_normalized'
2025-09-17 14:56:02,992 - INFO - |    ├── Finished extract_from_block_by_LoA()
2025-09-17 14:56:02,993 - INFO - ├── 💿 Initializing DataLoader object with 2216230 samples...
2025-09-17 14:56:03,016 - INFO - |    ├── 🔧 extract_from_block_by_LoA()
2025-09-17 14:56:03,016 - INFO - |    ├── Using reference key: 'all_observations_normalized'
2025-09-17 14:56:03,938 - INFO - |    ├── Finished extract_from_block_by_LoA()
2025-09-17 14:56:03,957 - INFO - ├── 💿 Initializing DataLoader object with 475057 samples...
2025-09-17 14:56:03,962 - INFO - |    ├── 🔧 extract_from_block_by_LoA()
2025-09-17 14:56:03,962 - INFO - |    ├── Using reference key: 'all_observations_normalized'
2025-09-17 14:56:04,729 - INFO - |    ├── Finished extract_from_block_by_LoA()
2025-09-17 14:56:04,733 - INFO - ├── 💿 Initializing DataLoader object with 475352