In [1]:
# Basic imports
import os
import sys
import time
import pickle
from itertools import product
import warnings

# System path modification
sys.path.insert(0, '..')

# Data handling
import pandas as pd
import numpy as np

# Machine learning imports
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import (
    LinearRegression, Lasso, LassoCV, MultiTaskLasso, MultiTaskLassoCV,
    ElasticNet, ElasticNetCV, MultiTaskElasticNet, MultiTaskElasticNetCV
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

from sklearn.cross_decomposition import PLSRegression
from sklearn.inspection import permutation_importance

# Custom modules
from src.train import *
from src.functions import *
from src.plots import *
from src.dataset import *
from src.multixgboost import *
from src.wrapper import *

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Deep learning and machine learning specific 
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
import xgboost as xgb
import shap

from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

from pytorch_tabular.models import (
    GatedAdditiveTreeEnsembleConfig,
    DANetConfig,
    TabTransformerConfig,
    FTTransformerConfig,
    TabNetModelConfig,
)

# Ignore warnings
warnings.filterwarnings("ignore")

# Print CUDA availability for PyTorch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
1


## Load data 

In [2]:
# Folder that holds the pickled result files
results_pickle_folder = "../pickle/"

# Load the data bundle (a mapping keyed by the names used below)
data = load_pickle_data_palettes()

# Unpack the tables one by one
df_X = data["df_X"]
df_y = data["df_y"]
df_all = data["df_all"]
df_FinalCombination = data["df_FinalCombination"]

# Named feature selections
dict_select = data["dict_select"]

# Colormaps are unpacked by position, so this relies on the ordering of data["colormaps"]
full_palette, gender_palette, dx_palette = data["colormaps"].values()

# Train-Test Split

In [3]:
# Rows with at least one missing feature go to training; fully observed rows are test candidates
row_has_nan = df_X.isna().any(axis=1)
idx_train = row_has_nan.tolist()
idx_test = (~row_has_nan).tolist()

# RIDs that occur on both sides of the provisional split
set_intersect_rid = set(df_all[idx_train].RID) & set(df_all[idx_test].RID)
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

# Move every test row whose RID also appears in training over to the training side,
# so that no RID ends up in both sets
for i, (rid_is_shared, bool_test) in enumerate(zip(intersect_rid_idx, idx_test)):
    if rid_is_shared and bool_test:
        idx_test[i] = False
        idx_train[i] = True

# The APOE_epsilon* columns are handled as categorical features
apoe_columns = ["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]
df_X[apoe_columns] = df_X[apoe_columns].astype("category")

# Feature matrices
df_X_train = df_X.loc[idx_train]
df_X_test = df_X.loc[idx_test]

# Targets
df_y_train = df_y.loc[idx_train]
df_y_test = df_y.loc[idx_test]

# AGE / PTGENDER / PTEDUCAT columns, selected positionally with the same boolean masks
covariate_columns = ["AGE", "PTGENDER", "PTEDUCAT"]
df_covariates = df_all[covariate_columns]
c_train = df_covariates.iloc[idx_train]
c_test = df_covariates.iloc[idx_test]

In [4]:
# Subject identifiers of the rows that ended up in the test split
df_all["SubjectID"].iloc[idx_test]

3609    128_S_2002
5631    116_S_4167
5662    033_S_4176
5780    098_S_4215
5950    018_S_4349
6069    941_S_4292
6077    116_S_4453
6085    135_S_4489
6224    033_S_4505
6400    014_S_4576
6429    073_S_4300
7021    003_S_2374
7192    033_S_4179
Name: SubjectID, dtype: object

Define all the models and combinations to try out with their hyperparameters. 

In [5]:
# Seed passed to the estimators below that accept one
random_state = 42
n_imputation_iter = 10

# Hyperparameters, presumably for a GAIN imputer (not consumed in this cell — confirm where they are used)
gain_parameters = {
    'hint_rate': 0.9,
    'alpha': 100,
    'iterations': 1000
}

# Continuous Imputer List (list of tuples with unique strings and corresponding instances)
continuous_imputer_list = [
    ("KNNImputer", KNNImputer(n_neighbors=1)),
]

# Ordinal Imputer List (list of tuples with unique strings and corresponding instances)
ordinal_imputer_list = [
    ("SimpleImputer_most_frequent", SimpleImputer(strategy="most_frequent")),
]

# Predictive Models List (list of tuples with unique strings and corresponding instances).
# The string is the key under which results are stored, so it must stay unique and stable.
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
    ("MultiTaskElasticNet", MultiTaskElasticNet()),
    ("MultiTaskElasticNet_tuned", MultiTaskElasticNet(alpha=0.01, l1_ratio=0.01)),
    ("MultiTaskLasso", MultiTaskLasso()),
    ("MultiTaskLasso_tuned", MultiTaskLasso(alpha=0.001)),
    # Seeded: an unseeded forest gives different scores on every run
    ("RandomForestRegressor", RandomForestRegressor(random_state=random_state)),
    # NOTE(review): the project wrappers below may also accept a seed — confirm and pass random_state if so
    ("XGBoostRegressor", XGBoostRegressor()),
    ("XGBoostRegressor_tuned", XGBoostRegressor(
        colsample_bytree=0.5079831261101071,
        learning_rate=0.0769592094304232,
        max_depth=6,
        min_child_weight=7,
        subsample=0.8049983288913105,
    )),
    ("TabNetRegressor_default", TabNetModelWrapper(n_a=8, n_d=8)),
    ("TabNetRegressor_custom", TabNetModelWrapper(n_a=32, n_d=32)),
    ("PLSRegression_4_components", PLSRegression(n_components=4))
]

# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Display all combinations
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: LinearRegression
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskElasticNet
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskElasticNet_tuned
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskLasso
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskLasso_tuned
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: RandomForestRegressor
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: XGBoostRegressor
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: XGBoostRegressor_tuned
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: TabNetRegressor_default
Continuous Imputer: KNNImputer, Ordinal Imputer: S

In [6]:
# Pickle file that accumulates one result dict per evaluated combination
results_file = '../pickle/training_2_dict_results.pickle'

# Start empty, then resume from a previous run when the pickle is already on disk.
# NOTE: pickle.load executes arbitrary code; only load files produced by this project.
all_dict_results = []
if os.path.exists(results_file):
    with open(results_file, "rb") as input_file:
        all_dict_results = pickle.load(input_file)

In [7]:
# NOTE(review): mid-notebook star import; presumably the source of clean_dict_list — consider moving it to the import cell
from src.debug import *
# Judging by the "Removed due to None value" messages in the cell output, clean_dict_list appears to drop
# result entries that contain a None value (e.g. failed runs) — confirm against src/debug.
all_dict_results = clean_dict_list(all_dict_results)

Removed due to None value. Other values: {'params': {'ordinal_imputer': 'NoImputer', 'continuous_imputer': 'NoImputer', 'model': 'LinearRegression', 'train_shape': (2881, 200), 'test_shape': (13, 200)}, 'fitting_time': 0.1571178436279297, 'results_adj': {'mse_score': array([1.11192349, 0.67668626, 0.61999815, 0.97750844]), 'mae_score': array([0.72849831, 0.70235973, 0.70986192, 0.71215901]), 'r2': array([-0.12379655,  0.2321367 , -0.29452346, -0.18906157]), 'explained_variance': array([ 0.02475189,  0.28845771, -0.29319881,  0.10973059]), 'corr': array([0.33204585, 0.54765514, 0.01649677, 0.38058137])}, 'results_org': {'mse_score': array([1.11192349, 0.67668626, 0.61999817, 0.97750843]), 'mae_score': array([0.72849832, 0.70235973, 0.70986193, 0.712159  ]), 'r2': array([-0.18280006,  0.24192465, -0.20755938, -0.14289477]), 'explained_variance': array([-0.02645227,  0.29752774, -0.20632372,  0.14429641]), 'corr': array([0.27766053, 0.55012684, 0.07284141, 0.39622517])}}
Removed due to No

In [8]:
# Train and evaluate every (continuous imputer, ordinal imputer, model) combination,
# checkpointing the accumulated results to `results_file` after each one.
for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    # Identity of this run; compared against stored results to detect finished combinations
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": df_X_train.shape,
        "test_shape": df_X_test.shape,
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        continue

    try:
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True,
        )
    except Exception as e:
        # Best effort: one failing model must not abort the sweep. Say which model failed,
        # then record a placeholder entry so the failure is visible in the results.
        print(f"{name_model} failed with {type(e).__name__}: {e}")
        dict_results = {
            "params": params,
            "imputation_time": None,
            "fitting_time": None,
            "results_adj": None,
            "results_org": None,
        }

    all_dict_results.append(dict_results)

    # Checkpoint through a temporary file and swap it in, so an interrupt during the dump
    # cannot leave a truncated pickle in place of the previous results.
    tmp_results_file = f"{results_file}.tmp"
    with open(tmp_results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)
    os.replace(tmp_results_file, results_file)


Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'LinearRegression', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet_tuned', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso_tuned', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'RandomForestRegressor', (2881, 348), (13, 348)])
Using separate imputers for ordinal and continuous data.
No NaN in test data -> Keep as it is. 
DataFrame.dtypes for data must be int, float, bool or category. When categorical 

In [9]:
# Serialize the accumulated results with the highest pickle protocol available
with open(results_file, 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(all_dict_results)

In [10]:
# Read the stored results back from disk under a separate name for inspection
with open('../pickle/training_2_dict_results.pickle', "rb") as input_file:
    dict_results_split = pickle.Unpickler(input_file).load()

In [11]:
# Display the reloaded list of result dicts (one entry per evaluated combination).
# NOTE(review): this prints the whole list; consider showing a slice or a summary table instead.
dict_results_split

[{'params': {'ordinal_imputer': 'SimpleImputer_most_frequent',
   'continuous_imputer': 'KNNImputer',
   'model': 'LinearRegression',
   'train_shape': (2881, 348),
   'test_shape': (13, 348)},
  'imputation_time': 3.2466392517089844,
  'fitting_time': 0.22939062118530273,
  'results_adj': {'mse_score': array([0.65983176, 0.48974311, 0.40951464, 0.56684282]),
   'mae_score': array([0.63965764, 0.55144938, 0.56535002, 0.59999667]),
   'r2': array([0.33312259, 0.44426866, 0.144955  , 0.31048062]),
   'explained_variance': array([0.36730884, 0.47760179, 0.14545957, 0.43985495]),
   'corr': array([0.60633336, 0.69220668, 0.4020632 , 0.66605104])},
  'results_org': {'mse_score': array([0.65983176, 0.4897431 , 0.40951466, 0.56684279]),
   'mae_score': array([0.63965764, 0.55144938, 0.56535003, 0.59999666]),
   'r2': array([0.29810904, 0.45135258, 0.20239561, 0.33725209]),
   'explained_variance': array([0.33409019, 0.48426081, 0.20286628, 0.46160331]),
   'corr': array([0.57814397, 0.6981620

## Add PyTorch Tabular models as well

In [12]:
# Column roles for pytorch_tabular: the APOE_epsilon* columns are categorical, everything else continuous
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

# Prepare Tabular configurations (shared for all PyTorch models)
data_config = DataConfig(
    target=df_y_train.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features
)
# NOTE(review): max_epochs=1 means early_stopping_patience=5 can never trigger — confirm this is
# a smoke-test setting and not the intended training budget.
# NOTE(review): the recorded run failed for every tabular model with "Weights only load failed"
# (PyTorch 2.6 `torch.load` default), presumably while reloading the best checkpoint
# (load_best=True) — confirm, then fix via a library upgrade or an allow-list.
trainer_config = TrainerConfig(
    batch_size=1024, max_epochs=1, auto_lr_find=True,
    early_stopping="valid_loss", early_stopping_mode="min", early_stopping_patience=5,
    checkpoints="valid_loss", load_best=True,
    progress_bar="none",  # was "nones", a typo that left the progress bars enabled
)
optimizer_config = OptimizerConfig()
# Head options are handed to the model configs as a plain dict
head_config = LinearHeadConfig(dropout=0.1).__dict__

In [13]:
def _wrap_tabular(model_config):
    """Pair a pytorch_tabular model config with the shared data, trainer and optimizer configs."""
    return TabularModelWrapper(model_config, data_config, trainer_config, optimizer_config)


# PyTorch Tabular models, as (unique result key, wrapped model) tuples
tabular_models_list = [
    ("GatedAdditiveTreeEnsembleConfig_tab",
     _wrap_tabular(GatedAdditiveTreeEnsembleConfig(
         task="regression",
         head="LinearHead",
         head_config=head_config,
         gflu_stages=6,
         gflu_dropout=0.0,
         tree_depth=5,
         num_trees=20,
         chain_trees=False,
         share_head_weights=True,
     ))),
    ("DANetConfig_tab",
     _wrap_tabular(DANetConfig(
         task="regression",
         head="LinearHead",
         head_config=head_config,
         n_layers=8,
         k=5,
         dropout_rate=0.1,
     ))),
    ("TabTransformerConfig_tab",
     _wrap_tabular(TabTransformerConfig(
         task="regression",
         head="LinearHead",
         head_config=head_config,
         embedding_initialization="kaiming_uniform",
         embedding_bias=False,
     ))),
    # FTTransformer is currently disabled:
    # ("FTTransformerConfig",
    #  _wrap_tabular(FTTransformerConfig(
    #      task="regression",
    #      head="LinearHead",
    #      head_config=head_config,
    #  ))),
    ("TabNetModelConfig_tab",
     _wrap_tabular(TabNetModelConfig(
         task="regression",
         head="LinearHead",
         head_config=head_config,
         n_d=8,
         n_a=8,
         n_steps=3,
         gamma=1.3,
         n_independent=2,
         n_shared=2,
     ))),
]

# Extend the model list only with names it does not hold yet, so re-running this cell
# does not append the same models a second time.
known_model_names = {model_name for model_name, _ in predictive_models_list}
predictive_models_list += [
    entry for entry in tabular_models_list if entry[0] not in known_model_names
]

# Generate all combinations (models already evaluated are skipped inside the loop)
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    # Identity of this run; compared against stored results to detect finished combinations
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": df_X_train.shape,
        "test_shape": df_X_test.shape,
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        continue

    try:
        print(name_model)

        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True,
        )
    except Exception as e:
        # Best effort: one failing model must not abort the sweep. Say which model failed,
        # then record a placeholder entry so the failure is visible in the results.
        print(f"{name_model} failed with {type(e).__name__}: {e}")
        dict_results = {
            "params": params,
            "imputation_time": None,
            "fitting_time": None,
            "results_adj": None,
            "results_org": None,
        }

    all_dict_results.append(dict_results)

    # Checkpoint through a temporary file and swap it in, so an interrupt during the dump
    # cannot leave a truncated pickle in place of the previous results.
    tmp_results_file = f"{results_file}.tmp"
    with open(tmp_results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)
    os.replace(tmp_results_file, results_file)

Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'LinearRegression', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet_tuned', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso_tuned', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'RandomForestRegressor', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'XGBoostRegressor', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA RTX 6000 Ada Generation') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_df83e326-5e12-4be6-98c6-05fcd8a4a59f.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_df83e326-5e12-4be6-98c6-05fcd8a4a59f.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                       | Params | Mode 
------------------------------------------------------------------------
0 | _backbone        | GatedAdditiveTreesBackbone | 5.6 M  | train
1 | _embedding_layer | Embedding1dLayer           | 714    | train
2 | _head            | CustomHead                 | 156    | train
3 | loss             | MSELoss                    | 0      | train
------------------------------------------------------------------------
5.6 M     Trainable params
0         Non-trainable params
5.6 M     Total params
22.286    Total estimated model params size (MB)
692       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_7689edd2-afc9-4094-813c-d03675ff9359.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_7689edd2-afc9-4094-813c-d03675ff9359.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | DANetBackbone    | 2.3 M  | train
1 | _embedding_layer | Embedding1dLayer | 714    | train
2 | _head            | LinearHead       | 260    | train
3 | loss             | MSELoss          | 0      | train
--------------------------------------------------------------
2.3 M     Trainable params
0         Non-trainable params
2.3 M     Total params
9.101     Total estimated model params size (MB)
159       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_169ca75e-9237-4b14-b824-0e762c907f36.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_169ca75e-9237-4b14-b824-0e762c907f36.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                   | Params | Mode 
--------------------------------------------------------------------
0 | _backbone        | TabTransformerBackbone | 271 K  | train
1 | _embedding_layer | Embedding2dLayer       | 408    | train
2 | _head            | LinearHead             | 1.8 K  | train
3 | loss             | MSELoss                | 0      | train
--------------------------------------------------------------------
274 K     Trainable params
0         Non-trainable params
274 K     Total params
1.097     Total estimated model params size (MB)
125       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_39f71446-d1b3-4079-9553-67a326a9ef0b.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_39f71446-d1b3-4079-9553-67a326a9ef0b.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type           | Params | Mode 
------------------------------------------------------------
0 | _embedding_layer | Identity       | 0      | train
1 | _backbone        | TabNetBackbone | 28.8 K | train
2 | _head            | Identity       | 0      | train
3 | loss             | MSELoss        | 0      | train
------------------------------------------------------------
28.8 K    Trainable params
0         Non-trainable params
28.8 K    Total params
0.115     Total estimated model params size (MB)
111       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

# Train models only on MRI features to compare performances

## Test train split

In [14]:
# Provisional split: rows with any missing feature -> train, fully observed rows -> test
row_has_nan = df_X.isna().any(axis=1)
idx_train = list(row_has_nan)
idx_test = list(~row_has_nan)

# RIDs present on both sides of the provisional split
set_intersect_rid = set(df_all[idx_train].RID).intersection(df_all[idx_test].RID)
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

# Positions of test rows whose RID also occurs in training; reassign them to training
shared_test_positions = [
    i for i, bool_test in enumerate(idx_test) if intersect_rid_idx.iloc[i] & bool_test
]
for i in shared_test_positions:
    idx_test[i] = False
    idx_train[i] = True

In [15]:
# Store the APOE_epsilon* columns with the pandas "category" dtype
apoe_columns = ["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]
df_X[apoe_columns] = df_X[apoe_columns].astype("category")

In [16]:
# Restrict the feature matrix to the "MRIth" selection before applying the split masks
mri_columns = dict_select["MRIth"]
df_X_mri = df_X[mri_columns]
df_X_train = df_X_mri.loc[idx_train]
df_X_test = df_X_mri.loc[idx_test]

# Targets follow the same masks
df_y_train = df_y.loc[idx_train]
df_y_test = df_y.loc[idx_test]

# AGE / PTGENDER / PTEDUCAT columns, selected positionally with the same boolean masks
covariate_columns = ["AGE", "PTGENDER", "PTEDUCAT"]
df_covariates = df_all[covariate_columns]
c_train = df_covariates.iloc[idx_train]
c_test = df_covariates.iloc[idx_test]

In [17]:
# Seed passed to the estimators below that accept one
random_state = 42
n_imputation_iter = 10

# Hyperparameters, presumably for a GAIN imputer (not consumed in this cell — confirm where they are used)
gain_parameters = {
    'hint_rate': 0.9,
    'alpha': 100,
    'iterations': 1000
}

# NOTE(review): both imputer entries are labelled "NoImputer" yet carry real imputer instances.
# Whether train_imputer_model skips imputation based on the label cannot be seen here — confirm.
# The labels are kept unchanged because stored results are keyed on them.

# Continuous Imputer List (list of tuples with unique strings and corresponding instances)
continuous_imputer_list = [
    ("NoImputer", KNNImputer(n_neighbors=1)),
]

# Ordinal Imputer List (list of tuples with unique strings and corresponding instances)
ordinal_imputer_list = [
    ("NoImputer", SimpleImputer(strategy="most_frequent")),
]

# Predictive Models List (list of tuples with unique strings and corresponding instances)
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
    ("MultiTaskElasticNet", MultiTaskElasticNet()),
    ("MultiTaskElasticNet_tuned", MultiTaskElasticNet(alpha=0.01, l1_ratio=0.01)),
    ("MultiTaskLasso", MultiTaskLasso()),
    ("MultiTaskLasso_tuned", MultiTaskLasso(alpha=0.001)),
    # Seeded: an unseeded forest gives different scores on every run
    ("RandomForestRegressor", RandomForestRegressor(random_state=random_state)),
    # NOTE(review): the project wrappers below may also accept a seed — confirm and pass random_state if so
    ("XGBoostRegressor", XGBoostRegressor()),
    ("XGBoostRegressor_tuned", XGBoostRegressor(
        colsample_bytree=0.5079831261101071,
        learning_rate=0.0769592094304232,
        max_depth=6,
        min_child_weight=7,
        subsample=0.8049983288913105,
    )),
    ("TabNetRegressor_default", TabNetModelWrapper(n_a=8, n_d=8)),
    ("TabNetRegressor_custom", TabNetModelWrapper(n_a=32, n_d=32)),
    ("PLSRegression_4_components", PLSRegression(n_components=4))
]


# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Display all combinations
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: LinearRegression
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskElasticNet
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskElasticNet_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskLasso
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskLasso_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: RandomForestRegressor
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: XGBoostRegressor
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: XGBoostRegressor_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: TabNetRegressor_default
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: TabNetRegressor_custom
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: PLSRegression_4_components
Combinations of preprocessing and model

In [18]:
# Pickle file (not HDF5) that accumulates one result dict per trained combination
results_file = '../pickle/training_2_dict_results.pickle'

# Resume from earlier runs when the file exists; on a fresh checkout there is
# nothing to load yet, so start from an empty list instead of crashing.
# NOTE: pickle.load can execute arbitrary code — only load files written by this project.
if os.path.exists(results_file):
    with open(results_file, "rb") as input_file:
        all_dict_results = pickle.load(input_file)
else:
    all_dict_results = []

In [19]:
# Train every (continuous imputer, ordinal imputer, model) combination and
# checkpoint the results list to disk after each one.
for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    # Identifies this combination inside all_dict_results
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model, "train_shape": df_X_train.shape,
        "test_shape": df_X_test.shape
    }

    # Same guard as the later training cells: without it, re-running this cell
    # retrains everything and appends duplicate records to the results file.
    if any(result['params'] == params for result in all_dict_results):
        print(f"Skipping existing combination: {params.values()}")
        continue

    try:
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:
        # Best effort: report the failure and store a placeholder record so the
        # remaining combinations still run.
        print(e)

        dict_results = {
            "params": params,
            "imputation_time": None,
            "fitting_time": None,
            "results_adj": None,
            "results_org": None
        }

    all_dict_results.append(dict_results)

    # Rewrite the pickle after every combination so an interrupted run keeps
    # the results gathered so far
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuo

In [20]:
# Serialize the full results list one more time, using the highest pickle protocol
with open(results_file, mode='wb') as handle:
    pickle.dump(
        all_dict_results,
        handle,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### Add PyTorch models only on MRI features

In [21]:
# The APOE allele columns are treated as ordinal and are left out of the feature
# list handed to the PyTorch models; every other column is passed as continuous.
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

# Prepare Tabular configurations (shared for all PyTorch models)
data_config = DataConfig(
    target=df_y_train.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=[]
)
# NOTE(review): with max_epochs=1 the early-stopping patience of 5 can never be
# reached (the logs report "`max_epochs=1` reached") — confirm that a single
# epoch is intended for the reported results.
trainer_config = TrainerConfig(
    batch_size=1024, max_epochs=1, auto_lr_find=True,
    early_stopping="valid_loss", early_stopping_mode="min", early_stopping_patience=5,
    # "none" is the documented value that turns the progress bar off; the
    # misspelling "nones" matches no option and left the bars enabled.
    checkpoints="valid_loss", load_best=True, progress_bar="none",
)
optimizer_config = OptimizerConfig()
# The model configs below receive the head settings as a plain dict
head_config = LinearHeadConfig(dropout=0.1).__dict__

In [22]:
# pytorch_tabular models as (unique label, wrapped model) pairs. They all share
# data_config, trainer_config, optimizer_config and head_config.
pytorch_tabular_models = [
    (
        "GatedAdditiveTreeEnsembleConfig_tab",
        TabularModelWrapper(
            GatedAdditiveTreeEnsembleConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                gflu_stages=6,
                gflu_dropout=0.0,
                tree_depth=5,
                num_trees=20,
                chain_trees=False,
                share_head_weights=True,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    (
        "DANetConfig_tab",
        TabularModelWrapper(
            DANetConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                n_layers=8,
                k=5,
                dropout_rate=0.1,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    (
        "TabTransformerConfig_tab",
        TabularModelWrapper(
            TabTransformerConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                embedding_initialization="kaiming_uniform",
                embedding_bias=False,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    # FTTransformer is currently disabled:
    # ("FTTransformerConfig",
    #     TabularModelWrapper(
    #     FTTransformerConfig(
    #     task="regression",
    #     head="LinearHead",
    #     head_config=head_config), data_config, trainer_config, optimizer_config
    # )),
    (
        "TabNetModelConfig_tab",
        TabularModelWrapper(
            TabNetModelConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                n_d=8,
                n_a=8,
                n_steps=3,
                gamma=1.3,
                n_independent=2,
                n_shared=2,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
]

# A bare `+=` appends another copy of every model each time this cell is
# re-executed; register each label at most once so the cell is idempotent.
registered_model_names = {name for name, _ in predictive_models_list}
predictive_models_list += [
    entry for entry in pytorch_tabular_models
    if entry[0] not in registered_model_names
]

# Rebuild the grid from the current contents of predictive_models_list
combinations = [
    *product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list)
]

# Train only the combinations that have no stored result yet, checkpointing
# the results list to disk after each one.
for continuous_imputer, ordinal_imputer, model in combinations:
    (name_continuous_imputer, continuous_imputer_instance) = continuous_imputer
    (name_ordinal_imputer, ordinal_imputer_instance) = ordinal_imputer
    (name_model, model_instance) = model

    # Key used to recognise this combination among the stored results
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": df_X_train.shape,
        "test_shape": df_X_test.shape,
    }

    if any(result['params'] == params for result in all_dict_results):
        # A record with identical params is already stored
        print(f"Skipping existing combination: {params.values()}")
        continue

    try:
        print(name_model)
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True,
        )
    except Exception as e:
        # Report the failure and store a placeholder whose metrics are all None
        print(e)
        dict_results = {
            "params": params,
            **dict.fromkeys(
                ("imputation_time", "fitting_time", "results_adj", "results_org")
            ),
        }

    all_dict_results.append(dict_results)

    # Persist after every combination
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'LinearRegression', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskElasticNet', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskElasticNet_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskLasso', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskLasso_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'RandomForestRegressor', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'XGBoostRegressor', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'XGBoostRegressor_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer'

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_d16bcaca-1262-4984-bb48-d655e08cd046.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_d16bcaca-1262-4984-bb48-d655e08cd046.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                       | Params | Mode 
------------------------------------------------------------------------
0 | _backbone        | GatedAdditiveTreesBackbone | 2.1 M  | train
1 | _embedding_layer | Embedding1dLayer           | 400    | train
2 | _head            | CustomHead                 | 156    | train
3 | loss             | MSELoss                    | 0      | train
------------------------------------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.417     Total estimated model params size (MB)
689       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_ead3d129-337d-4460-8ec5-94e4a743580e.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_ead3d129-337d-4460-8ec5-94e4a743580e.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | DANetBackbone    | 1.4 M  | train
1 | _embedding_layer | Embedding1dLayer | 400    | train
2 | _head            | LinearHead       | 260    | train
3 | loss             | MSELoss          | 0      | train
--------------------------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.787     Total estimated model params size (MB)
156       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_34342d7a-0b99-4b6b-8b04-22620c8a8d9f.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_34342d7a-0b99-4b6b-8b04-22620c8a8d9f.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                   | Params | Mode 
--------------------------------------------------------------------
0 | _backbone        | TabTransformerBackbone | 271 K  | train
1 | _embedding_layer | Embedding2dLayer       | 0      | train
2 | _head            | LinearHead             | 804    | train
3 | loss             | MSELoss                | 0      | train
--------------------------------------------------------------------
272 K     Trainable params
0         Non-trainable params
272 K     Total params
1.090     Total estimated model params size (MB)
119       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_760bab5e-2abb-4245-97e3-e87fb4095c20.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_760bab5e-2abb-4245-97e3-e87fb4095c20.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type           | Params | Mode 
------------------------------------------------------------
0 | _embedding_layer | Identity       | 0      | train
1 | _backbone        | TabNetBackbone | 18.9 K | train
2 | _head            | Identity       | 0      | train
3 | loss             | MSELoss        | 0      | train
------------------------------------------------------------
18.9 K    Trainable params
0         Non-trainable params
18.9 K    Total params
0.075     Total estimated model params size (MB)
107       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

# Print Table for reporting

In [23]:
# Path of the pickle file that holds the accumulated training results
results_file = "../pickle/training_2_dict_results.pickle"

In [24]:
# Reload the stored results list from disk
with open(results_file, mode="rb") as input_file:
    all_dict_results = pickle.load(input_file)

In [25]:
# pytorch_tabular models as (unique label, wrapped model) pairs. They all share
# data_config, trainer_config, optimizer_config and head_config.
pytorch_tabular_models = [
    (
        "GatedAdditiveTreeEnsembleConfig_tab",
        TabularModelWrapper(
            GatedAdditiveTreeEnsembleConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                gflu_stages=6,
                gflu_dropout=0.0,
                tree_depth=5,
                num_trees=20,
                chain_trees=False,
                share_head_weights=True,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    (
        "DANetConfig_tab",
        TabularModelWrapper(
            DANetConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                n_layers=8,
                k=5,
                dropout_rate=0.1,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    (
        "TabTransformerConfig_tab",
        TabularModelWrapper(
            TabTransformerConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                embedding_initialization="kaiming_uniform",
                embedding_bias=False,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    # FTTransformer is currently disabled:
    # ("FTTransformerConfig",
    #     TabularModelWrapper(
    #     FTTransformerConfig(
    #     task="regression",
    #     head="LinearHead",
    #     head_config=head_config), data_config, trainer_config, optimizer_config
    # )),
    (
        "TabNetModelConfig_tab",
        TabularModelWrapper(
            TabNetModelConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                n_d=8,
                n_a=8,
                n_steps=3,
                gamma=1.3,
                n_independent=2,
                n_shared=2,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
]

# A bare `+=` appends another copy of every model whenever the list already
# contains them (for example after an earlier cell or a re-run); register each
# label at most once so the list never holds duplicates.
registered_model_names = {name for name, _ in predictive_models_list}
predictive_models_list += [
    entry for entry in pytorch_tabular_models
    if entry[0] not in registered_model_names
]

# Rebuild the grid from the current contents of predictive_models_list
combinations = [
    *product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list)
]

# Train only the combinations that have no stored result yet, checkpointing
# the results list to disk after each one.
for continuous_imputer, ordinal_imputer, model in combinations:
    (name_continuous_imputer, continuous_imputer_instance) = continuous_imputer
    (name_ordinal_imputer, ordinal_imputer_instance) = ordinal_imputer
    (name_model, model_instance) = model

    # Key used to recognise this combination among the stored results
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": df_X_train.shape,
        "test_shape": df_X_test.shape,
    }

    if any(result['params'] == params for result in all_dict_results):
        # A record with identical params is already stored
        print(f"Skipping existing combination: {params.values()}")
        continue

    try:
        print(name_model)
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True,
        )
    except Exception as e:
        # Report the failure and store a placeholder whose metrics are all None
        print(e)
        dict_results = {
            "params": params,
            **dict.fromkeys(
                ("imputation_time", "fitting_time", "results_adj", "results_org")
            ),
        }

    all_dict_results.append(dict_results)

    # Persist after every combination
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'LinearRegression', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskElasticNet', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskElasticNet_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskLasso', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskLasso_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'RandomForestRegressor', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'XGBoostRegressor', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'XGBoostRegressor_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer'

In [26]:
# Pass the loaded results through clean_dict_list (presumably provided by one of
# the `src.*` star imports). The cell output logs "Removed due to None value"
# for records whose metrics are None, which matches the placeholders stored for
# failed runs — NOTE(review): confirm against the helper's implementation.
all_dict_results = clean_dict_list(all_dict_results)

Removed due to None value. Other values: {'params': {'ordinal_imputer': 'SimpleImputer_most_frequent', 'continuous_imputer': 'KNNImputer', 'model': 'XGBoostRegressor', 'train_shape': (2881, 348), 'test_shape': (13, 348)}}
Removed due to None value. Other values: {'params': {'ordinal_imputer': 'SimpleImputer_most_frequent', 'continuous_imputer': 'KNNImputer', 'model': 'XGBoostRegressor_tuned', 'train_shape': (2881, 348), 'test_shape': (13, 348)}}
Removed due to None value. Other values: {'params': {'ordinal_imputer': 'SimpleImputer_most_frequent', 'continuous_imputer': 'KNNImputer', 'model': 'GatedAdditiveTreeEnsembleConfig_tab', 'train_shape': (2881, 348), 'test_shape': (13, 348)}}
Removed due to None value. Other values: {'params': {'ordinal_imputer': 'SimpleImputer_most_frequent', 'continuous_imputer': 'KNNImputer', 'model': 'DANetConfig_tab', 'train_shape': (2881, 348), 'test_shape': (13, 348)}}
Removed due to None value. Other values: {'params': {'ordinal_imputer': 'SimpleImputer_m

In [27]:
# pytorch_tabular models as (unique label, wrapped model) pairs. They all share
# data_config, trainer_config, optimizer_config and head_config.
pytorch_tabular_models = [
    (
        "GatedAdditiveTreeEnsembleConfig_tab",
        TabularModelWrapper(
            GatedAdditiveTreeEnsembleConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                gflu_stages=6,
                gflu_dropout=0.0,
                tree_depth=5,
                num_trees=20,
                chain_trees=False,
                share_head_weights=True,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    (
        "DANetConfig_tab",
        TabularModelWrapper(
            DANetConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                n_layers=8,
                k=5,
                dropout_rate=0.1,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    (
        "TabTransformerConfig_tab",
        TabularModelWrapper(
            TabTransformerConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                embedding_initialization="kaiming_uniform",
                embedding_bias=False,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
    # FTTransformer is currently disabled:
    # ("FTTransformerConfig",
    #     TabularModelWrapper(
    #     FTTransformerConfig(
    #     task="regression",
    #     head="LinearHead",
    #     head_config=head_config), data_config, trainer_config, optimizer_config
    # )),
    (
        "TabNetModelConfig_tab",
        TabularModelWrapper(
            TabNetModelConfig(
                task="regression",
                head="LinearHead",
                head_config=head_config,
                n_d=8,
                n_a=8,
                n_steps=3,
                gamma=1.3,
                n_independent=2,
                n_shared=2,
            ),
            data_config, trainer_config, optimizer_config,
        ),
    ),
]

# A bare `+=` appends another copy of every model whenever the list already
# contains them (for example after an earlier cell or a re-run); register each
# label at most once so the list never holds duplicates.
registered_model_names = {name for name, _ in predictive_models_list}
predictive_models_list += [
    entry for entry in pytorch_tabular_models
    if entry[0] not in registered_model_names
]

# Rebuild the grid from the current contents of predictive_models_list
combinations = [
    *product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list)
]

# Train only the combinations that have no stored result yet, checkpointing
# the results list to disk after each one.
for continuous_imputer, ordinal_imputer, model in combinations:
    (name_continuous_imputer, continuous_imputer_instance) = continuous_imputer
    (name_ordinal_imputer, ordinal_imputer_instance) = ordinal_imputer
    (name_model, model_instance) = model

    # Key used to recognise this combination among the stored results
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": df_X_train.shape,
        "test_shape": df_X_test.shape,
    }

    if any(result['params'] == params for result in all_dict_results):
        # A record with identical params is already stored
        print(f"Skipping existing combination: {params.values()}")
        continue

    try:
        print(name_model)
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True,
        )
    except Exception as e:
        # Report the failure and store a placeholder whose metrics are all None
        print(e)
        dict_results = {
            "params": params,
            **dict.fromkeys(
                ("imputation_time", "fitting_time", "results_adj", "results_org")
            ),
        }

    all_dict_results.append(dict_results)

    # Persist after every combination
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

LinearRegression
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
MultiTaskElasticNet
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
MultiTaskElasticNet_tuned
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
MultiTaskLasso
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
MultiTaskLasso_tuned
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
RandomForestRegressor
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
XGBoostRegressor
Using separate imputers for ordinal and contin

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_876daee4-7a82-43e9-bdf1-09457822dd6f.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_876daee4-7a82-43e9-bdf1-09457822dd6f.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                       | Params | Mode 
------------------------------------------------------------------------
0 | _backbone        | GatedAdditiveTreesBackbone | 2.1 M  | train
1 | _embedding_layer | Embedding1dLayer           | 400    | train
2 | _head            | CustomHead                 | 156    | train
3 | loss             | MSELoss                    | 0      | train
------------------------------------------------------------------------
2.1 M     Trainable params
0         Non-trainable params
2.1 M     Total params
8.417     Total estimated model params size (MB)
689       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_c560011d-ab92-480d-9966-d1c4118aa1cf.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_c560011d-ab92-480d-9966-d1c4118aa1cf.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type             | Params | Mode 
--------------------------------------------------------------
0 | _backbone        | DANetBackbone    | 1.4 M  | train
1 | _embedding_layer | Embedding1dLayer | 400    | train
2 | _head            | LinearHead       | 260    | train
3 | loss             | MSELoss          | 0      | train
--------------------------------------------------------------
1.4 M     Trainable params
0         Non-trainable params
1.4 M     Total params
5.787     Total estimated model params size (MB)
156       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_ad7a3db4-4aa1-4406-8b59-d38918fcbb27.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_ad7a3db4-4aa1-4406-8b59-d38918fcbb27.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                   | Params | Mode 
--------------------------------------------------------------------
0 | _backbone        | TabTransformerBackbone | 271 K  | train
1 | _embedding_layer | Embedding2dLayer       | 0      | train
2 | _head            | LinearHead             | 804    | train
3 | loss             | MSELoss                | 0      | train
--------------------------------------------------------------------
272 K     Trainable params
0         Non-trainable params
272 K     Total params
1.090     Total estimated model params size (MB)
119       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LR finder stopped early after 3 steps due to diverging loss.
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.
Restoring states from the checkpoint path at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_6d918c7a-39f4-4830-bad2-2d33363bf743.ckpt
Restored all states from the checkpoint at /home/cschneuwly/Documents/projects/optimus/notebooks/.lr_find_6d918c7a-39f4-4830-bad2-2d33363bf743.ckpt
Failed to compute suggestion for learning rate because there are not enough points. Increase the loop iteration limits or the size of your dataset/dataloader.


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type           | Params | Mode 
------------------------------------------------------------
0 | _embedding_layer | Identity       | 0      | train
1 | _backbone        | TabNetBackbone | 18.9 K | train
2 | _head            | Identity       | 0      | train
3 | loss             | MSELoss        | 0      | train
------------------------------------------------------------
18.9 K    Trainable params
0         Non-trainable params
18.9 K    Total params
0.075     Total estimated model params size (MB)
107       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL omegaconf.dictconfig.DictConfig was not an allowed global by default. Please use `torch.serialization.add_safe_globals([omegaconf.dictconfig.DictConfig])` or the `torch.serialization.safe_globals([omegaconf.dictconfig.DictConfig])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about 

In [28]:
# Inspect the raw list of per-experiment result dicts.
# NOTE(review): this displays the whole list; consider showing a slice such as all_dict_results[:1].
all_dict_results

[{'params': {'ordinal_imputer': 'SimpleImputer_most_frequent',
   'continuous_imputer': 'KNNImputer',
   'model': 'LinearRegression',
   'train_shape': (2881, 348),
   'test_shape': (13, 348)},
  'imputation_time': 3.2466392517089844,
  'fitting_time': 0.22939062118530273,
  'results_adj': {'mse_score': array([0.65983176, 0.48974311, 0.40951464, 0.56684282]),
   'mae_score': array([0.63965764, 0.55144938, 0.56535002, 0.59999667]),
   'r2': array([0.33312259, 0.44426866, 0.144955  , 0.31048062]),
   'explained_variance': array([0.36730884, 0.47760179, 0.14545957, 0.43985495]),
   'corr': array([0.60633336, 0.69220668, 0.4020632 , 0.66605104])},
  'results_org': {'mse_score': array([0.65983176, 0.4897431 , 0.40951466, 0.56684279]),
   'mae_score': array([0.63965764, 0.55144938, 0.56535003, 0.59999666]),
   'r2': array([0.29810904, 0.45135258, 0.20239561, 0.33725209]),
   'explained_variance': array([0.33409019, 0.48426081, 0.20286628, 0.46160331]),
   'corr': array([0.57814397, 0.6981620

In [36]:
# Tabular view of the experiment results: one row per entry of all_dict_results.
pd.DataFrame(data=all_dict_results)

Unnamed: 0,params,imputation_time,fitting_time,results_adj,results_org
0,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.246639,0.229391,"{'mse_score': [0.659831758923781, 0.4897431123...","{'mse_score': [0.6598317562332855, 0.489743104..."
1,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.283444,0.098224,"{'mse_score': [1.1801224802168766, 0.950415349...","{'mse_score': [1.1801224892206006, 0.950415348..."
2,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.266497,1.492615,"{'mse_score': [0.6679134194781107, 0.478566173...","{'mse_score': [0.6679134155298025, 0.478566167..."
3,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.283154,0.077769,"{'mse_score': [1.3936376654364884, 1.040563386...","{'mse_score': [1.39363767778841, 1.04056338628..."
4,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.311443,1.492246,"{'mse_score': [0.6555347035833681, 0.476021533...","{'mse_score': [0.655534699980677, 0.4760215269..."
5,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.28933,51.797212,"{'mse_score': [0.8120496178211589, 0.557388719...","{'mse_score': [0.8120496171306019, 0.557388722..."
6,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.3004,0.150168,"{'mse_score': [0.9928374904980006, 0.683205912...","{'mse_score': [0.9928374937104268, 0.683205911..."
7,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.375947,13.772108,"{'mse_score': [0.6182489690113672, 0.611846183...","{'mse_score': [0.6182489748896371, 0.611846177..."
8,{'ordinal_imputer': 'SimpleImputer_most_freque...,3.454399,13.395208,"{'mse_score': [1.0761740236083321, 1.312529050...","{'mse_score': [1.076174034182812, 1.3125290559..."
9,"{'ordinal_imputer': 'NoImputer', 'continuous_i...",,0.090095,"{'mse_score': [1.1119234898099377, 0.676686264...","{'mse_score': [1.111923487408025, 0.6766862634..."


In [None]:
# NOTE(review): this cell only evaluates the Python builtin `all`; it looks like a
# leftover/unfinished scratch cell (possibly a truncated `all_dict_results`) — confirm and remove.
all

In [29]:
def generate_metric_table(
    results_list,
    targets,
    metric_name,
    source="Adjusted",
    float_format="%.3f",
    csv_filename=None,
    sort_order="ascending"  # or "descending"
):
    """
    Create a LaTeX table for a single metric across targets, models, and imputers.
    Optionally export the same table as CSV and sort by mean performance.

    Parameters
    ----------
    results_list : list of dict
        List of experiment results.
    targets : list of str
        Target names (e.g., ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN']).
    metric_name : str
        Metric to extract (e.g., 'mae_score').
    source : str
        'Adjusted' or 'Original'.
    float_format : str
        Format for floats (e.g., '%.3f').
    csv_filename : str or None
        If provided, saves the table to CSV.
    sort_order : str
        'ascending' or 'descending' for sorting by mean.

    Returns
    -------
    str
        LaTeX-formatted table string.
    """
    rows = []
    version_key = "results_adj" if source.lower() == "adjusted" else "results_org"

    for res in results_list:
        result_block = res.get(version_key)
        if result_block is None:
            continue

        metric_values = result_block.get(metric_name)
        if metric_values is None:
            continue

        if len(metric_values) != len(targets):
            continue

        ordinal_imputer = res["params"].get("ordinal_imputer")
        model = res["params"].get("model")

        values = np.array(metric_values, dtype=np.float64)
        mean_val = np.mean(values)
        std_val = np.std(values)

        row = {
            "Ordinal Imputer": ordinal_imputer,
            "Model": model,
            "Mean": mean_val,  # for sorting
            "Mean ± SD": f"{mean_val:.3f} ± {std_val:.3f}",
        }
        row.update({target: val for target, val in zip(targets, values)})
        rows.append(row)

    df = pd.DataFrame(rows)

    # Reorder columns for display
    display_cols = ["Ordinal Imputer", "Model"] + targets + ["Mean ± SD"]
    df = df.sort_values(by="Mean", ascending=(sort_order == "ascending"))
    df = df[display_cols]

    # Save CSV
    if csv_filename:
        df.to_csv(csv_filename, index=False)

    # LaTeX output
    latex_table = df.to_latex(
        index=False,
        escape=False,
        float_format=float_format,
        caption=f"{metric_name.replace('_', ' ').upper()} across targets",
        label=f"tab:{metric_name}",
        longtable=False
    )

    return df, latex_table


In [30]:
# Correlation ('corr') table on the adjusted results, highest mean first;
# the same table is also written to CSV.
# The LaTeX string is named after the metric it holds (it was previously stored as `latex_mae`).
latex_df, latex_corr = generate_metric_table(
    results_list=all_dict_results,
    targets=['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    metric_name='corr',
    source="Adjusted",
    csv_filename="../tables/2_training_train_test_corr_adjusted_sorted.csv",
    sort_order="descending"
)
print(latex_corr)

\begin{table}
\caption{CORR across targets}
\label{tab:corr}
\begin{tabular}{llrrrrl}
\toprule
Ordinal Imputer & Model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & Mean ± SD \\
\midrule
SimpleImputer_most_frequent & TabNetRegressor_default & 0.773 & 0.676 & 0.581 & 0.819 & 0.712 ± 0.092 \\
SimpleImputer_most_frequent & MultiTaskLasso_tuned & 0.619 & 0.707 & 0.377 & 0.675 & 0.594 ± 0.130 \\
SimpleImputer_most_frequent & LinearRegression & 0.606 & 0.692 & 0.402 & 0.666 & 0.592 ± 0.114 \\
SimpleImputer_most_frequent & MultiTaskElasticNet_tuned & 0.608 & 0.705 & 0.382 & 0.669 & 0.591 ± 0.126 \\
SimpleImputer_most_frequent & RandomForestRegressor & 0.635 & 0.677 & 0.343 & 0.627 & 0.570 ± 0.133 \\
NoImputer & XGBoostRegressor_tuned & 0.481 & 0.591 & 0.307 & 0.656 & 0.509 ± 0.132 \\
NoImputer & RandomForestRegressor & 0.540 & 0.644 & 0.217 & 0.610 & 0.503 ± 0.169 \\
NoImputer & XGBoostRegressor & 0.522 & 0.698 & 0.212 & 0.566 & 0.500 ± 0.178 \\
NoImputer & TabNetRegressor_custom & 0.553 & 0.58

In [31]:
# R2 ('r2') table on the adjusted results, highest mean first;
# the same table is also written to CSV.
# The LaTeX string is named after the metric it holds (it was previously stored as `latex_mae`).
latex_df, latex_r2 = generate_metric_table(
    results_list=all_dict_results,
    targets=['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    metric_name='r2',
    source="Adjusted",
    csv_filename="../tables/2_training_train_test_r2_adjusted_sorted.csv",
    sort_order="descending"
)
print(latex_r2)

\begin{table}
\caption{R2 across targets}
\label{tab:r2}
\begin{tabular}{llrrrrl}
\toprule
Ordinal Imputer & Model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & Mean ± SD \\
\midrule
SimpleImputer_most_frequent & LinearRegression & 0.333 & 0.444 & 0.145 & 0.310 & 0.308 ± 0.107 \\
SimpleImputer_most_frequent & MultiTaskLasso_tuned & 0.337 & 0.460 & 0.125 & 0.291 & 0.303 ± 0.120 \\
SimpleImputer_most_frequent & TabNetRegressor_default & 0.375 & 0.306 & 0.230 & 0.297 & 0.302 ± 0.051 \\
SimpleImputer_most_frequent & MultiTaskElasticNet_tuned & 0.325 & 0.457 & 0.135 & 0.279 & 0.299 ± 0.115 \\
SimpleImputer_most_frequent & RandomForestRegressor & 0.179 & 0.368 & 0.114 & 0.072 & 0.183 ± 0.113 \\
NoImputer & XGBoostRegressor_tuned & -0.026 & 0.275 & 0.032 & 0.139 & 0.105 ± 0.115 \\
NoImputer & XGBoostRegressor & -0.008 & 0.373 & -0.040 & 0.092 & 0.104 ± 0.162 \\
NoImputer & RandomForestRegressor & -0.017 & 0.262 & 0.001 & -0.000 & 0.061 ± 0.116 \\
SimpleImputer_most_frequent & PLSRegression_4_co

In [32]:
# MSE ('mse_score') table on the adjusted results, lowest mean first;
# the same table is also written to CSV.
# The LaTeX string is named after the metric it holds (it was previously stored as `latex_mae`).
latex_df, latex_mse = generate_metric_table(
    results_list=all_dict_results,
    targets=['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    metric_name='mse_score',
    source="Adjusted",
    csv_filename="../tables/2_training_train_test_mse_adjusted_sorted.csv",
    sort_order="ascending"
)
print(latex_mse)

\begin{table}
\caption{MSE SCORE across targets}
\label{tab:mse_score}
\begin{tabular}{llrrrrl}
\toprule
Ordinal Imputer & Model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & Mean ± SD \\
\midrule
SimpleImputer_most_frequent & LinearRegression & 0.660 & 0.490 & 0.410 & 0.567 & 0.531 ± 0.093 \\
SimpleImputer_most_frequent & MultiTaskLasso_tuned & 0.656 & 0.476 & 0.419 & 0.582 & 0.533 ± 0.092 \\
SimpleImputer_most_frequent & MultiTaskElasticNet_tuned & 0.668 & 0.479 & 0.414 & 0.593 & 0.538 ± 0.098 \\
SimpleImputer_most_frequent & TabNetRegressor_default & 0.618 & 0.612 & 0.369 & 0.578 & 0.544 ± 0.102 \\
SimpleImputer_most_frequent & RandomForestRegressor & 0.812 & 0.557 & 0.424 & 0.763 & 0.639 ± 0.156 \\
NoImputer & XGBoostRegressor & 0.997 & 0.553 & 0.498 & 0.746 & 0.699 ± 0.196 \\
NoImputer & XGBoostRegressor_tuned & 1.015 & 0.639 & 0.463 & 0.707 & 0.706 ± 0.199 \\
NoImputer & RandomForestRegressor & 1.006 & 0.650 & 0.479 & 0.822 & 0.739 ± 0.196 \\
SimpleImputer_most_frequent & PLSRegres

In [33]:
# MAE ('mae_score') table on the adjusted results, lowest mean first;
# the same table is also written to CSV.
latex_df, latex_mae = generate_metric_table(
    all_dict_results,
    ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    'mae_score',
    sort_order="ascending",
    csv_filename="../tables/2_training_train_test_mae_adjusted_sorted.csv",
    source="Adjusted",
)
print(latex_mae)

\begin{table}
\caption{MAE SCORE across targets}
\label{tab:mae_score}
\begin{tabular}{llrrrrl}
\toprule
Ordinal Imputer & Model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & Mean ± SD \\
\midrule
SimpleImputer_most_frequent & MultiTaskLasso_tuned & 0.622 & 0.551 & 0.578 & 0.595 & 0.586 ± 0.026 \\
SimpleImputer_most_frequent & LinearRegression & 0.640 & 0.551 & 0.565 & 0.600 & 0.589 ± 0.034 \\
SimpleImputer_most_frequent & MultiTaskElasticNet_tuned & 0.630 & 0.554 & 0.580 & 0.600 & 0.591 ± 0.028 \\
SimpleImputer_most_frequent & TabNetRegressor_default & 0.691 & 0.608 & 0.536 & 0.595 & 0.607 ± 0.055 \\
SimpleImputer_most_frequent & RandomForestRegressor & 0.669 & 0.640 & 0.608 & 0.648 & 0.641 ± 0.022 \\
NoImputer & XGBoostRegressor & 0.754 & 0.635 & 0.643 & 0.644 & 0.669 ± 0.049 \\
NoImputer & XGBoostRegressor_tuned & 0.735 & 0.715 & 0.630 & 0.613 & 0.673 ± 0.052 \\
NoImputer & RandomForestRegressor & 0.755 & 0.684 & 0.635 & 0.632 & 0.677 ± 0.050 \\
NoImputer & MultiTaskLasso_tuned & 0.72

In [34]:
# Count how many rows of the last generated table belong to each model.
latex_df["Model"].value_counts()

Model
MultiTaskLasso_tuned          2
LinearRegression              2
MultiTaskElasticNet_tuned     2
TabNetRegressor_default       2
RandomForestRegressor         2
PLSRegression_4_components    2
TabNetRegressor_custom        2
MultiTaskElasticNet           2
MultiTaskLasso                2
XGBoostRegressor_tuned        1
XGBoostRegressor              1
Name: count, dtype: int64