In [2]:
# Basic imports
import os
import sys
import time
import timeit
import random
import pickle
import re
from itertools import product
import warnings

# System path modification
sys.path.insert(0, '..')

# Data handling
import pandas as pd
import numpy as np

# Machine learning imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer, MissingIndicator
from sklearn.model_selection import (
    train_test_split, KFold, StratifiedKFold, GroupKFold, StratifiedGroupKFold, LeaveOneOut, cross_validate, cross_val_score
)
from sklearn.linear_model import (
    LinearRegression, Lasso, LassoCV, MultiTaskLasso, MultiTaskLassoCV,
    ElasticNet, ElasticNetCV, MultiTaskElasticNet, MultiTaskElasticNetCV
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

from sklearn.cross_decomposition import PLSRegression
from sklearn.inspection import permutation_importance

# Statistics imports
from scipy.stats import ks_2samp
from scipy.special import kl_div
from scipy.stats import pearsonr
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist, squareform

# Specialized imputation and visualization packages
import miceforest as mf
import missingno as msno
#from missforest import MissForest
#import magic
from src.gain import *

# Custom modules
from src.train import *
from src.functions import *
from src.plots import *
from src.dataset import *
from src.multixgboost import *
from src.wrapper import *
from src.debug import *

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Deep learning and machine learning specific 
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
import xgboost as xgb
import shap

from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.models import (
    GatedAdditiveTreeEnsembleConfig,
    DANetConfig,
    TabTransformerConfig,
    TabNetModelConfig,
)

# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence warnings;
# consider narrowing the filter with `category=` / `module=`.
warnings.filterwarnings("ignore")

# Report whether PyTorch sees a CUDA device, and how many
print(torch.cuda.is_available())
print(torch.cuda.device_count())

from omegaconf import DictConfig

# Allow-list DictConfig for torch's restricted (weights_only) unpickler.
# `torch.serialization.safe_globals` is a context manager: constructing it without a
# `with` statement registers nothing, so the previous call was a no-op that only echoed
# the unused object. `add_safe_globals` performs the registration for the whole session
# (presumably what is wanted for checkpoints loaded later — confirm).
torch.serialization.add_safe_globals([DictConfig])

True
1


<torch.serialization.safe_globals at 0x752f25d08410>

## Load data 

In [3]:
# Load the data bundle through the project helper
data = load_pickle_data_palettes()

results_pickle_folder = "../pickle/"

# Pull the individual entries out of the bundle, one name per key
df_X = data["df_X"]
df_y = data["df_y"]
df_all = data["df_all"]
df_FinalCombination = data["df_FinalCombination"]
dict_select = data["dict_select"]

# The three palettes are taken in the iteration order of data["colormaps"]
full_palette, gender_palette, dx_palette = tuple(data["colormaps"].values())

In [4]:
# Rows with at least one missing feature are training candidates; fully observed rows
# are hold-out candidates.
_row_has_nan = df_X.isna().any(axis=1).to_numpy()

# RIDs that occur both among incomplete rows and among complete rows
set_intersect_rid = set(df_all[_row_has_nan].RID) & set(df_all[~_row_has_nan].RID)
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

# A complete row stays held out only when its RID never appears in an incomplete row;
# every other row goes to training.
_held_out = ~_row_has_nan & ~intersect_rid_idx.to_numpy()
idx_test = _held_out.tolist()
idx_train = (~_held_out).tolist()

print(sum(idx_test))

print(df_all[idx_test].RID)

# Cast the APOE columns to int where possible (errors='ignore' keeps the original
# values when the cast fails)
_apoe_columns = ["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]
df_X[_apoe_columns] = df_X[_apoe_columns].astype("int", errors='ignore')

# Positional row numbers of the held-out samples
test_indices = [i for i, val in enumerate(idx_test) if val]

13
3609    2002
5631    4167
5662    4176
5780    4215
5950    4349
6069    4292
6077    4453
6085    4489
6224    4505
6400    4576
6429    4300
7021    2374
7192    4179
Name: RID, dtype: int64


# Leave-One-Complete-Out (LOCO-CV)

## All features

In [5]:
random_state = 42

# Each list below holds (unique label, estimator instance) pairs.

# Imputers for the continuous features
continuous_imputer_list = [
    ("KNNImputer_5", KNNImputer(n_neighbors=5)),
    ("IterativeImputer_Niter=1", IterativeImputer(max_iter=1, random_state=random_state)),
]

# Imputers for the ordinal features
ordinal_imputer_list = [
    ("KNNImputer1", KNNImputer(n_neighbors=1)),
]

# Predictive models.
# NOTE(review): RandomForestRegressor is created without a random_state, so its
# results differ between runs.
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
    ("MultiTaskElasticNet", MultiTaskElasticNet()),
    ("MultiTaskElasticNet_tuned", MultiTaskElasticNet(alpha=0.1, l1_ratio=0.1)),
    ("MultiTaskLasso", MultiTaskLasso()),
    ("MultiTaskLasso_tuned", MultiTaskLasso(alpha=0.001)),
    ("RandomForestRegressor", RandomForestRegressor()),
    ("XGBoostRegressor", XGBoostRegressor()),
    (
        "XGBoostRegressor_tuned",
        XGBoostRegressor(
            colsample_bytree=0.8776807051588262,
            learning_rate=0.13329520360246094,
            max_depth=8,
            min_child_weight=4,
            subsample=0.5924272277627636,
        ),
    ),
    ("TabNetRegressor_default", TabNetModelWrapper(n_a=8, n_d=8)),
    ("TabNetRegressor_custom", TabNetModelWrapper(n_a=32, n_d=32)),
    ("PLSRegression_4_components", PLSRegression(n_components=4)),
]

In [6]:
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [column for column in df_X.columns if column not in ordinal_features]

# Configuration objects shared by every pytorch_tabular model added in this cell
data_config = DataConfig(
    target=df_y.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features,
)
# NOTE(review): progress_bar="nones" looks like a typo for "none" — confirm against
# the pytorch_tabular TrainerConfig documentation.
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=10,
    auto_lr_find=False,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    checkpoints="valid_loss",
    load_best=True,
    progress_bar="nones",
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(dropout=0.1).__dict__

# Keyword arguments common to all four model configs
_head_kwargs = {"task": "regression", "head": "LinearHead", "head_config": head_config}

# label -> model config, in the order the models are appended
_tabular_model_configs = {
    "GatedAdditiveTreeEnsembleConfig_tab": GatedAdditiveTreeEnsembleConfig(
        gflu_stages=6,
        gflu_dropout=0.0,
        tree_depth=5,
        num_trees=20,
        chain_trees=False,
        share_head_weights=True,
        **_head_kwargs,
    ),
    "DANetConfig_tab": DANetConfig(
        n_layers=8,
        k=5,
        dropout_rate=0.1,
        **_head_kwargs,
    ),
    "TabTransformerConfig_tab": TabTransformerConfig(
        embedding_initialization="kaiming_uniform",
        embedding_bias=False,
        **_head_kwargs,
    ),
    "TabNetModelConfig_tab": TabNetModelConfig(
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2,
        **_head_kwargs,
    ),
}

# Extends the existing list in place, so re-running this cell appends the models again
predictive_models_list += [
    (label, TabularModelWrapper(model_config, data_config, trainer_config, optimizer_config))
    for label, model_config in _tabular_model_configs.items()
]

In [7]:
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [name for name in df_X.columns if name not in ordinal_features]

# Shared pytorch_tabular configuration; this cell trains for up to 250 epochs
data_config = DataConfig(
    target=df_y.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features,
)
# NOTE(review): progress_bar="nones" looks like a typo for "none" — confirm against
# the pytorch_tabular TrainerConfig documentation.
trainer_config = TrainerConfig(
    batch_size=1024, max_epochs=250, auto_lr_find=False,
    early_stopping="valid_loss", early_stopping_mode="min", early_stopping_patience=5,
    checkpoints="valid_loss", load_best=True, progress_bar="nones",
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(dropout=0.1).__dict__


def _as_tabular_wrapper(model_config):
    """Wrap a model config together with this cell's data/trainer/optimizer configs."""
    return TabularModelWrapper(model_config, data_config, trainer_config, optimizer_config)


# Same four architectures as the "_tab" entries, labelled with an "_epochs" suffix.
# The list is extended in place, so re-running this cell appends the models again.
predictive_models_list.extend([
    (
        "GatedAdditiveTreeEnsembleConfig_tab_epochs",
        _as_tabular_wrapper(GatedAdditiveTreeEnsembleConfig(
            task="regression",
            head="LinearHead",
            head_config=head_config,
            gflu_stages=6,
            gflu_dropout=0.0,
            tree_depth=5,
            num_trees=20,
            chain_trees=False,
            share_head_weights=True,
        )),
    ),
    (
        "DANetConfig_tab_epochs",
        _as_tabular_wrapper(DANetConfig(
            task="regression",
            head="LinearHead",
            head_config=head_config,
            n_layers=8,
            k=5,
            dropout_rate=0.1,
        )),
    ),
    (
        "TabTransformerConfig_tab_epochs",
        _as_tabular_wrapper(TabTransformerConfig(
            task="regression",
            head="LinearHead",
            head_config=head_config,
            embedding_initialization="kaiming_uniform",
            embedding_bias=False,
        )),
    ),
    (
        "TabNetModelConfig_tab_epochs",
        _as_tabular_wrapper(TabNetModelConfig(
            task="regression",
            head="LinearHead",
            head_config=head_config,
            n_d=8,
            n_a=8,
            n_steps=3,
            gamma=1.3,
            n_independent=2,
            n_shared=2,
        )),
    ),
])

In [8]:
# Cartesian product of the three configuration lists: one
# (continuous imputer, ordinal imputer, model) triple per experiment.
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Print the labels (first element of each (label, instance) pair) of every experiment
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: LinearRegression
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: MultiTaskElasticNet
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: MultiTaskElasticNet_tuned
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: MultiTaskLasso
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: MultiTaskLasso_tuned
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: RandomForestRegressor
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: XGBoostRegressor
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: XGBoostRegressor_tuned
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: TabNetRegressor_default
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: TabNetRegressor_custom
Continuous Imputer: KNNImputer_5, Ordinal Imputer: KNNImputer1, Model: PLSRegressi

In [9]:
# Path of the pickle file that accumulates the experiment results
# (it is read and written with `pickle`, not HDF5).
results_file = '../pickle/training_3_loonona_dict_results.pickle'

In [10]:
# Start with an empty result list on the first run; otherwise resume from the results
# saved by a previous run.
# NOTE: pickle.load executes arbitrary code — only load files produced by this project.
if not os.path.exists(results_file):
    all_dict_results = []
else:
    with open(results_file, "rb") as input_file:
        all_dict_results = pickle.load(input_file)

In [11]:
# clean_dict_list is a star-imported project helper; presumably this drops stored
# entries whose "fitting_time" is None — TODO confirm against its definition.
all_dict_results = clean_dict_list(all_dict_results, remove_if_none=False, remove_key_val={"fitting_time":None})

In [12]:
# Leave-one-complete-out evaluation of every (continuous imputer, ordinal imputer,
# model) combination on all features. For each combination, every row listed in
# `test_indices` is held out in turn and the remaining rows are used for training.
# Finished combinations are appended to `all_dict_results` and saved to `results_file`
# right away, so an interrupted run can resume.

# A stored combination is recognised by these three labels only; the shape entries in
# `params` are informational and are not compared.
keys_to_check = ['ordinal_imputer', 'continuous_imputer', 'model']
# Per-fold values copied from the dict returned by train_imputer_model
result_keys = ["imputation_time", "fitting_time", "results_adj", "results_org"]
n_rows, n_cols = df_X.shape

for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": [n_rows - 1, n_cols],
        "test_shape": [1, n_cols],
    }
    combination_label = [params[k] for k in keys_to_check]

    # Skip combinations that already have a stored result
    if any(all(result['params'].get(k) == params.get(k) for k in keys_to_check) for result in all_dict_results):
        print(f"Skipping existing combination (subset match): {combination_label}")
        continue

    dict_results = {"params": params, **{key: [] for key in result_keys}}
    failed_folds = []

    for test_nloc in test_indices:
        print(test_nloc)

        # Positional boolean masks: exactly one held-out row, all others for training
        idx_test = [i == test_nloc for i in range(n_rows)]
        idx_train = [not is_test for is_test in idx_test]

        df_X_train = df_X.loc[idx_train]
        df_X_test = df_X.loc[idx_test]

        df_y_train = df_y.loc[idx_train]
        df_y_test = df_y.loc[idx_test]

        c_train = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_train]
        c_test = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_test]

        try:
            # NOTE(review): the same imputer/model instances are passed for every fold;
            # whether train_imputer_model clones them is not visible here.
            fold_dict_results = train_imputer_model(
                df_X_train, df_X_test, df_y_train, df_y_test,
                c_train, c_test,
                ordinal_imputer_instance, name_ordinal_imputer,
                continuous_imputer_instance, name_continuous_imputer,
                model_instance, name_model,
                separate_imputers=True
            )
            # Read all values before appending so a missing key cannot leave the
            # four result lists with different lengths.
            fold_values = [fold_dict_results[key] for key in result_keys]
        except Exception as e:
            # Best effort: report the failing fold and carry on with the next one
            failed_folds.append(test_nloc)
            print(f"Fold {test_nloc} failed for {combination_label}: {type(e).__name__}: {e}")
            continue

        for key, value in zip(result_keys, fold_values):
            dict_results[key].append(value)

    # A combination without any successful fold must not be stored: it would be
    # treated as "existing" and silently skipped on every later run.
    if not dict_results["results_adj"]:
        print(f"No fold succeeded for {combination_label}; nothing saved, it will be retried on the next run.")
        continue
    if failed_folds:
        print(f"Warning: {len(failed_folds)} of {len(test_indices)} folds failed for {combination_label}: {failed_folds}")

    all_dict_results.append(dict_results)

    # Write to a temporary file and swap it in, so an interruption during the dump
    # cannot truncate the results accumulated so far.
    tmp_results_file = results_file + ".tmp"
    with open(tmp_results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)
    os.replace(tmp_results_file, results_file)


Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'LinearRegression']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'MultiTaskElasticNet']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'MultiTaskElasticNet_tuned']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'MultiTaskLasso']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'MultiTaskLasso_tuned']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'RandomForestRegressor']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'XGBoostRegressor']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'XGBoostRegressor_tuned']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'TabNetRegressor_default']
Skipping existing combination (subset match): ['KNNImputer1', 'KNNImputer_5', 'TabNetR

In [13]:
# Echo the complete results list.
# NOTE(review): this prints every stored timing and prediction array; showing only
# each entry's "params" would keep the output readable.
all_dict_results


[{'params': {'ordinal_imputer': 'KNNImputer',
   'continuous_imputer': 'KNNImputer_5',
   'model': 'LinearRegression',
   'train_shape': [2893, 256],
   'test_shape': [1, 256]},
  'imputation_time': [2.619654417037964,
   2.7029354572296143,
   2.6850600242614746,
   2.680514097213745,
   2.567904233932495,
   2.625931739807129,
   2.574939727783203,
   2.5893666744232178,
   2.644169569015503,
   2.7041895389556885,
   2.7017781734466553,
   2.6986489295959473,
   2.7805628776550293],
  'fitting_time': [0.28696179389953613,
   0.21365666389465332,
   0.18554425239562988,
   0.1051933765411377,
   0.11029458045959473,
   0.11472582817077637,
   0.055664777755737305,
   0.20960426330566406,
   0.1804213523864746,
   0.20218634605407715,
   0.3804609775543213,
   0.15373802185058594,
   0.21019673347473145],
  'results_adj': [{'y_pred': array([[0.15705823, 0.98176379, 0.5622339 , 0.01345371]]),
    'y_test': array([[2.12744091, 1.81874947, 0.83318666, 1.62322893]])},
   {'y_pred': array(

In [14]:
# Serialize the full results list to results_file, overwriting the file,
# using the highest pickle protocol available.
with open(results_file, 'wb') as handle:
    pickle.dump(all_dict_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Display the path the results were written to
results_file

'../pickle/training_3_loonona_dict_results.pickle'

In [16]:
# Read the stored results back from disk under a separate name.
# NOTE: pickle.load executes arbitrary code — only load files produced by this project.
with open(results_file, "rb") as input_file:
    dict_results_loo_nona = pickle.load(input_file)

In [17]:
# Flatten the list of result dicts into a DataFrame; nested keys such as "params"
# become dotted column names.
df_results_nona = pd.json_normalize(dict_results_loo_nona)

# Train only on MRI features

In [18]:
# Rows with any missing feature are training candidates; complete rows are hold-out
# candidates.
_has_missing = df_X.isna().any(axis=1).to_numpy()

# RIDs present among both incomplete and complete rows
set_intersect_rid = set(df_all[_has_missing].RID) & set(df_all[~_has_missing].RID)
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

# Complete rows sharing a RID with an incomplete row are moved to the training side
_keep_for_test = ~_has_missing & ~intersect_rid_idx.to_numpy()
idx_test = _keep_for_test.tolist()
idx_train = (~_keep_for_test).tolist()

In [19]:
# Cast the APOE columns to int where possible (errors='ignore' keeps the original
# values when the cast fails)
_apoe_columns = ["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]
df_X[_apoe_columns] = df_X[_apoe_columns].astype("int", errors='ignore')

In [20]:
# Restrict the features to the "MRIth" selection, then split features, targets and
# covariates with the boolean masks idx_train / idx_test.
_mri_feature_frame = df_X[dict_select["MRIth"]]
_covariate_frame = df_all[["AGE", "PTGENDER", "PTEDUCAT"]]

df_X_train, df_X_test = _mri_feature_frame.loc[idx_train], _mri_feature_frame.loc[idx_test]

df_y_train, df_y_test = df_y.loc[idx_train], df_y.loc[idx_test]

c_train, c_test = _covariate_frame.iloc[idx_train], _covariate_frame.iloc[idx_test]

In [21]:
n_imputation_iter = 10

# Each list below holds (unique label, estimator instance) pairs.
# NOTE(review): both imputers are labelled "NoImputer" although real imputer instances
# are passed; presumably the MRI features contain no missing values — confirm.

# Imputers for the continuous features
continuous_imputer_list = [
    ("NoImputer", KNNImputer(n_neighbors=1)),
]

# Imputers for the ordinal features
ordinal_imputer_list = [
    ("NoImputer", SimpleImputer(strategy="most_frequent")),
]

# Predictive models.
# NOTE(review): RandomForestRegressor is created without a random_state, so its
# results differ between runs.
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
    ("MultiTaskElasticNet", MultiTaskElasticNet()),
    ("MultiTaskElasticNet_tuned", MultiTaskElasticNet(alpha=0.1, l1_ratio=0.1)),
    ("MultiTaskLasso", MultiTaskLasso()),
    ("MultiTaskLasso_tuned", MultiTaskLasso(alpha=0.001)),
    ("RandomForestRegressor", RandomForestRegressor()),
    ("XGBoostRegressor", XGBoostRegressor()),
    (
        "XGBoostRegressor_tuned",
        XGBoostRegressor(
            colsample_bytree=0.8776807051588262,
            learning_rate=0.13329520360246094,
            max_depth=8,
            min_child_weight=4,
            subsample=0.5924272277627636,
        ),
    ),
    ("TabNetRegressor_default", TabNetModelWrapper(n_a=8, n_d=8)),
    ("TabNetRegressor_custom", TabNetModelWrapper(n_a=32, n_d=32)),
    ("PLSRegression_4_components", PLSRegression(n_components=4)),
]

In [22]:
# NOTE(review): these two lists are computed but not passed to DataConfig below, which
# uses dict_select["MRIth"] and no categorical columns.
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [column for column in df_X_train.columns if column not in ordinal_features]

# Configuration objects shared by every pytorch_tabular model added in this cell
data_config = DataConfig(
    target=df_y_train.columns.tolist(),
    continuous_cols=dict_select["MRIth"],
    categorical_cols=[],
)

# NOTE(review): progress_bar="nones" looks like a typo for "none" — confirm against
# the pytorch_tabular TrainerConfig documentation.
trainer_config = TrainerConfig(
    batch_size=1024,
    max_epochs=10,
    auto_lr_find=True,
    early_stopping="valid_loss",
    early_stopping_mode="min",
    early_stopping_patience=5,
    checkpoints="valid_loss",
    load_best=True,
    progress_bar="nones",
)

optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(dropout=0.1).__dict__

# Keyword arguments common to all four model configs
_mri_head_kwargs = {"task": "regression", "head": "LinearHead", "head_config": head_config}

# (label, model config) pairs in the order the models are appended
_mri_tabular_configs = [
    (
        "GatedAdditiveTreeEnsembleConfig_tab",
        GatedAdditiveTreeEnsembleConfig(
            gflu_stages=6,
            gflu_dropout=0.0,
            tree_depth=5,
            num_trees=20,
            chain_trees=False,
            share_head_weights=True,
            **_mri_head_kwargs,
        ),
    ),
    (
        "DANetConfig_tab",
        DANetConfig(n_layers=8, k=5, dropout_rate=0.1, **_mri_head_kwargs),
    ),
    (
        "TabTransformerConfig_tab",
        TabTransformerConfig(
            embedding_initialization="kaiming_uniform",
            embedding_bias=False,
            **_mri_head_kwargs,
        ),
    ),
    (
        "TabNetModelConfig_tab",
        TabNetModelConfig(
            n_d=8,
            n_a=8,
            n_steps=3,
            gamma=1.3,
            n_independent=2,
            n_shared=2,
            **_mri_head_kwargs,
        ),
    ),
]

# Extends the existing list in place, so re-running this cell appends the models again
predictive_models_list += [
    (label, TabularModelWrapper(model_config, data_config, trainer_config, optimizer_config))
    for label, model_config in _mri_tabular_configs
]


In [23]:
# Cartesian product of the three configuration lists: one
# (continuous imputer, ordinal imputer, model) triple per experiment.
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Print the labels (first element of each (label, instance) pair) of every experiment
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: LinearRegression
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskElasticNet
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskElasticNet_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskLasso
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskLasso_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: RandomForestRegressor
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: XGBoostRegressor
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: XGBoostRegressor_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: TabNetRegressor_default
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: TabNetRegressor_custom
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: PLSRegression_4_components
Continuous Imputer: NoImputer, Ordinal 

In [24]:
# Pickle file holding the accumulated results (the same file as for the all-features
# runs; it is read with `pickle`, not HDF5).
results_file = '../pickle/training_3_loonona_dict_results.pickle'

# Resume from the stored results.
# NOTE: pickle.load executes arbitrary code — only load files produced by this project.
with open(results_file, "rb") as input_file:
    all_dict_results = pickle.load(input_file)

In [25]:
# clean_dict_list is a star-imported project helper; presumably this drops stored
# entries whose "fitting_time" is None — TODO confirm against its definition.
all_dict_results = clean_dict_list(all_dict_results, remove_if_none=False, remove_key_val={"fitting_time":None})

In [26]:
# List the parameter dict of every stored result, one per line
for res in all_dict_results: 
    print(res["params"])

{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImputer_5', 'model': 'LinearRegression', 'train_shape': [2893, 256], 'test_shape': [1, 256]}
{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImputer_5', 'model': 'MultiTaskElasticNet', 'train_shape': [2893, 256], 'test_shape': [1, 256]}
{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImputer_5', 'model': 'MultiTaskElasticNet_tuned', 'train_shape': [2893, 256], 'test_shape': [1, 256]}
{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImputer_5', 'model': 'MultiTaskLasso', 'train_shape': [2893, 256], 'test_shape': [1, 256]}
{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImputer_5', 'model': 'MultiTaskLasso_tuned', 'train_shape': [2893, 256], 'test_shape': [1, 256]}
{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImputer_5', 'model': 'RandomForestRegressor', 'train_shape': [2893, 256], 'test_shape': [1, 256]}
{'ordinal_imputer': 'KNNImputer', 'continuous_imputer': 'KNNImp

In [27]:
# NOTE(review): src.debug is already star-imported in the first cell; this repeats it.
from src.debug import *

# Parameter dicts of stored combinations to drop from all_dict_results (currently none)
rm_combinations = [
]
# Manual switch: the removal loop never runs while the condition is the literal False
if False : 
    for par in rm_combinations:
        all_dict_results = clean_dict_list(all_dict_results, remove_if_none=False, remove_key_val={"params":par})

In [28]:
# Leave-one-complete-out evaluation of every (continuous imputer, ordinal imputer,
# model) combination, restricted to the dict_select["MRIth"] features. For each
# combination, every row listed in `test_indices` is held out in turn and the remaining
# rows are used for training. Finished combinations are appended to `all_dict_results`
# and saved to `results_file` right away, so an interrupted run can resume.

# A stored combination is recognised by these three labels only; the shape entries in
# `params` are informational and are not compared.
keys_to_check = ['ordinal_imputer', 'continuous_imputer', 'model']
# Per-fold values copied from the dict returned by train_imputer_model
result_keys = ["imputation_time", "fitting_time", "results_adj", "results_org"]

# The feature subset is the same for every fold, so select it once
df_X_mri = df_X[dict_select["MRIth"]]
n_rows, n_mri_cols = df_X_mri.shape

for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    # Record the shape of the data actually used for training (the MRI subset); the
    # column count of the full df_X was stored here before.
    params = {
        "ordinal_imputer": name_ordinal_imputer,
        "continuous_imputer": name_continuous_imputer,
        "model": name_model,
        "train_shape": [n_rows - 1, n_mri_cols],
        "test_shape": [1, n_mri_cols],
    }
    combination_label = [params[k] for k in keys_to_check]

    # Skip combinations that already have a stored result
    if any(all(result['params'].get(k) == params.get(k) for k in keys_to_check) for result in all_dict_results):
        print(f"Skipping existing combination (subset match): {combination_label}")
        continue

    dict_results = {"params": params, **{key: [] for key in result_keys}}
    failed_folds = []

    for test_nloc in test_indices:
        print(test_nloc)

        # Positional boolean masks: exactly one held-out row, all others for training
        idx_test = [i == test_nloc for i in range(n_rows)]
        idx_train = [not is_test for is_test in idx_test]

        df_X_train = df_X_mri.loc[idx_train]
        df_X_test = df_X_mri.loc[idx_test]

        df_y_train = df_y.loc[idx_train]
        df_y_test = df_y.loc[idx_test]

        c_train = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_train]
        c_test = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_test]

        try:
            # NOTE(review): the same imputer/model instances are passed for every fold;
            # whether train_imputer_model clones them is not visible here.
            fold_dict_results = train_imputer_model(
                df_X_train, df_X_test, df_y_train, df_y_test,
                c_train, c_test,
                ordinal_imputer_instance, name_ordinal_imputer,
                continuous_imputer_instance, name_continuous_imputer,
                model_instance, name_model,
                separate_imputers=True
            )
            # Read all values before appending so a missing key cannot leave the
            # four result lists with different lengths.
            fold_values = [fold_dict_results[key] for key in result_keys]
        except Exception as e:
            # Best effort: report the failing fold and carry on with the next one
            failed_folds.append(test_nloc)
            print(f"Fold {test_nloc} failed for {combination_label}: {type(e).__name__}: {e}")
            continue

        for key, value in zip(result_keys, fold_values):
            dict_results[key].append(value)

    # A combination without any successful fold must not be stored: it would be
    # treated as "existing" and silently skipped on every later run.
    if not dict_results["results_adj"]:
        print(f"No fold succeeded for {combination_label}; nothing saved, it will be retried on the next run.")
        continue
    if failed_folds:
        print(f"Warning: {len(failed_folds)} of {len(test_indices)} folds failed for {combination_label}: {failed_folds}")

    all_dict_results.append(dict_results)

    # Write to a temporary file and swap it in, so an interruption during the dump
    # cannot truncate the results accumulated so far.
    tmp_results_file = results_file + ".tmp"
    with open(tmp_results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)
    os.replace(tmp_results_file, results_file)


Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'LinearRegression']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'MultiTaskElasticNet']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'MultiTaskElasticNet_tuned']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'MultiTaskLasso']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'MultiTaskLasso_tuned']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'RandomForestRegressor']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'XGBoostRegressor']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'XGBoostRegressor_tuned']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'TabNetRegressor_default']
Skipping existing combination (subset match): ['NoImputer', 'NoImputer', 'TabNetRegressor_custom']
Skipping existing combination (s

In [29]:
# Serialize the full results list to results_file, overwriting the file,
# using the highest pickle protocol available.
with open(results_file, 'wb') as handle:
    pickle.dump(all_dict_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [30]:
# Read the stored results back from disk.
# NOTE: pickle.load executes arbitrary code — only load files produced by this project.
with open(results_file, "rb") as input_file:
    all_dict_results = pickle.load(input_file)

# Print Table for paper

In [31]:
# Load the stored results for the table below. The path is spelled out instead of using
# results_file, so this cell does not depend on the cells that define that variable.
# NOTE: pickle.load executes arbitrary code — only load files produced by this project.
with open('../pickle/training_3_loonona_dict_results.pickle', "rb") as input_file:
    dict_results_loo_nona = pickle.load(input_file)

In [32]:
def generate_metric_table(
    results_list,
    targets,
    metric_name="r2",
    source="Adjusted",
    csv_filename=None,
    sort_order="descending"
):

    # Choose the key
    key = "results_adj" if source.lower() == "adjusted" else "results_org"
    df_rows = []

    for res in results_list:
        result_blocks = res.get(key, [])
        if not isinstance(result_blocks, list) or len(result_blocks) == 0:
            continue

        # Aggregate all predictions across folds
        y_preds_all = {target: [] for target in targets}
        y_tests_all = {target: [] for target in targets}

        for fold in result_blocks:
            y_pred = fold["y_pred"]
            y_test = fold["y_test"]

            if y_pred.shape != y_test.shape:
                continue

            for i, target in enumerate(targets):
                if y_pred.shape[1] <= i:
                    continue
                y_preds_all[target].append(y_pred[0, i])
                y_tests_all[target].append(y_test[0, i])

        # Compute metric for each target
        target_metrics = []
        for target in targets:
            y_preds = y_preds_all[target]
            y_tests = y_tests_all[target]

            if len(y_preds) < 2:
                metric = float("nan")
            else:
                if metric_name == "r2":
                    metric = r2_score(y_tests, y_preds)
                elif metric_name == "mae":
                    metric = mean_absolute_error(y_tests, y_preds)
                elif metric_name == "mse":
                    metric = mean_squared_error(y_tests, y_preds)
                elif metric_name == "explained_variance":
                    metric = explained_variance_score(y_tests, y_preds)
                elif metric_name == "corr":
                    metric = pearsonr(y_tests, y_preds)[0]
                else:
                    raise ValueError(f"Unsupported metric: {metric_name}")

            target_metrics.append(metric)

        # Compute mean ± std across targets
        values = np.array(target_metrics, dtype=np.float64)
        mean_val = np.nanmean(values)
        std_val = np.nanstd(values)

        # Get times
        imp_times = np.array(res.get("imputation_time", []), dtype=np.float64)
        fit_times = np.array(res.get("fitting_time", []), dtype=np.float64)

        row = {
            "continuous_imputer": res["params"].get("continuous_imputer"),
            "ordinal_imputer": res["params"].get("ordinal_imputer"),
            "model": res["params"].get("model"),
            **{target: val for target, val in zip(targets, target_metrics)},
            "mean ± std": f"{mean_val:.3f} ± {std_val:.3f}",
            "imp_time (s)": f"{np.nanmean(imp_times):.1f} ± {np.nanstd(imp_times):.1f}",
            "fit_time (s)": f"{np.nanmean(fit_times):.1f} ± {np.nanstd(fit_times):.1f}",
            "_sort_val": mean_val  # hidden sort key
        }
        df_rows.append(row)

    df = pd.DataFrame(df_rows)

    # Sort by mean value
    df = df.sort_values(by="_sort_val", ascending=(sort_order == "ascending")).drop(columns="_sort_val")

    # Optional: Save to CSV
    if csv_filename:
        df.to_csv(csv_filename, index=False)

    # Format for LaTeX output
    latex_df = df.copy()
    for target in targets:
        latex_df[target] = latex_df[target].apply(lambda x: f"{x:.3f}" if pd.notnull(x) else "–")

    latex_output = latex_df.to_latex(index=False, escape=False)

    return df, latex_output


In [33]:
# Pearson-correlation table built from the "Adjusted" results, ordered from the
# highest to the lowest mean across targets. The first return value is the
# unformatted DataFrame; the second is the LaTeX source that gets printed.
latex_df, latex_corr = generate_metric_table(
    dict_results_loo_nona,
    ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    source="Adjusted",
    metric_name='corr',
    sort_order="descending",
    csv_filename="../tables/3_training_loonona_corr_adjusted_sorted.csv",
)
print(latex_corr)

\begin{tabular}{llllllllll}
\toprule
continuous_imputer & ordinal_imputer & model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & mean ± std & imp_time (s) & fit_time (s) \\
\midrule
KNNImputer_5 & KNNImputer1 & RandomForestRegressor & 0.725 & 0.677 & 0.407 & 0.766 & 0.644 ± 0.140 & 2.6 ± 0.0 & 40.7 ± 0.2 \\
KNNImputer_5 & KNNImputer & RandomForestRegressor & 0.678 & 0.738 & 0.358 & 0.700 & 0.619 ± 0.152 & 2.5 ± 0.0 & 39.8 ± 0.5 \\
IterativeImputer_Niter=1 & KNNImputer1 & RandomForestRegressor & 0.717 & 0.700 & 0.295 & 0.742 & 0.613 ± 0.184 & 94.2 ± 2.7 & 39.9 ± 0.4 \\
KNNImputer_5 & KNNImputer & TabNetRegressor_default & 0.678 & 0.556 & 0.489 & 0.711 & 0.609 ± 0.090 & 2.5 ± 0.0 & 12.4 ± 0.7 \\
KNNImputer_5 & KNNImputer1 & TabNetRegressor_default & 0.678 & 0.556 & 0.489 & 0.711 & 0.609 ± 0.090 & 2.6 ± 0.0 & 12.4 ± 0.6 \\
IterativeImputer_Niter=1 & KNNImputer1 & XGBoostRegressor_tuned & 0.675 & 0.688 & 0.200 & 0.722 & 0.571 ± 0.215 & 94.5 ± 1.6 & 5.1 ± 0.3 \\
IterativeImputer_Niter=1 & KNNI

In [34]:
# R² table built from the "Adjusted" results, ordered from the highest to the
# lowest mean across targets. The first return value is the unformatted
# DataFrame; the second is the LaTeX source that gets printed.
latex_df, latex_r2 = generate_metric_table(
    dict_results_loo_nona,
    ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    source="Adjusted",
    metric_name='r2',
    sort_order="descending",
    csv_filename="../tables/3_training_loonona_r2_adjusted_sorted.csv",
)
print(latex_r2)

\begin{tabular}{llllllllll}
\toprule
continuous_imputer & ordinal_imputer & model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & mean ± std & imp_time (s) & fit_time (s) \\
\midrule
KNNImputer_5 & KNNImputer & TabNetRegressor_default & 0.292 & 0.267 & 0.236 & 0.312 & 0.277 ± 0.028 & 2.5 ± 0.0 & 12.4 ± 0.7 \\
KNNImputer_5 & KNNImputer1 & TabNetRegressor_default & 0.292 & 0.267 & 0.236 & 0.312 & 0.277 ± 0.028 & 2.6 ± 0.0 & 12.4 ± 0.6 \\
KNNImputer_5 & KNNImputer1 & RandomForestRegressor & 0.250 & 0.362 & 0.158 & 0.279 & 0.262 ± 0.073 & 2.6 ± 0.0 & 40.7 ± 0.2 \\
IterativeImputer_Niter=1 & KNNImputer1 & XGBoostRegressor_tuned & 0.300 & 0.412 & -0.100 & 0.347 & 0.240 ± 0.200 & 94.5 ± 1.6 & 5.1 ± 0.3 \\
IterativeImputer_Niter=1 & KNNImputer1 & XGBoostRegressor & 0.279 & 0.417 & 0.062 & 0.180 & 0.235 ± 0.130 & 93.0 ± 2.1 & 1.0 ± 0.0 \\
IterativeImputer_Niter=1 & KNNImputer1 & RandomForestRegressor & 0.250 & 0.370 & 0.060 & 0.166 & 0.212 ± 0.114 & 94.2 ± 2.7 & 39.9 ± 0.4 \\
IterativeImputer_Niter

In [35]:
# Mean-absolute-error table built from the "Adjusted" results, ordered from the
# lowest to the highest mean across targets. The first return value is the
# unformatted DataFrame; the second is the LaTeX source that gets printed.
latex_df, latex_mae = generate_metric_table(
    dict_results_loo_nona,
    ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    source="Adjusted",
    metric_name='mae',
    sort_order="ascending",
    csv_filename="../tables/3_training_loonona_mae_adjusted_sorted.csv",
)
print(latex_mae)

\begin{tabular}{llllllllll}
\toprule
continuous_imputer & ordinal_imputer & model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & mean ± std & imp_time (s) & fit_time (s) \\
\midrule
KNNImputer_5 & KNNImputer & TabNetRegressor_default & 0.648 & 0.624 & 0.550 & 0.592 & 0.603 ± 0.037 & 2.5 ± 0.0 & 12.4 ± 0.7 \\
KNNImputer_5 & KNNImputer1 & TabNetRegressor_default & 0.648 & 0.624 & 0.550 & 0.592 & 0.603 ± 0.037 & 2.6 ± 0.0 & 12.4 ± 0.6 \\
IterativeImputer_Niter=1 & KNNImputer1 & RandomForestRegressor & 0.611 & 0.634 & 0.630 & 0.571 & 0.611 ± 0.025 & 94.2 ± 2.7 & 39.9 ± 0.4 \\
KNNImputer_5 & KNNImputer1 & RandomForestRegressor & 0.648 & 0.626 & 0.599 & 0.592 & 0.616 ± 0.022 & 2.6 ± 0.0 & 40.7 ± 0.2 \\
IterativeImputer_Niter=1 & KNNImputer1 & XGBoostRegressor_tuned & 0.692 & 0.614 & 0.632 & 0.548 & 0.622 ± 0.051 & 94.5 ± 1.6 & 5.1 ± 0.3 \\
KNNImputer_5 & KNNImputer & RandomForestRegressor & 0.667 & 0.608 & 0.619 & 0.636 & 0.633 ± 0.022 & 2.5 ± 0.0 & 39.8 ± 0.5 \\
KNNImputer_5 & KNNImputer & XGB

In [36]:
# Mean-squared-error table built from the "Adjusted" results, ordered from the
# lowest to the highest mean across targets. The first return value is the
# unformatted DataFrame; the second is the LaTeX source that gets printed.
latex_df, latex_mse = generate_metric_table(
    dict_results_loo_nona,
    ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    source="Adjusted",
    metric_name='mse',
    sort_order="ascending",
    csv_filename="../tables/3_training_loonona_mse_adjusted_sorted.csv",
)
print(latex_mse)

\begin{tabular}{llllllllll}
\toprule
continuous_imputer & ordinal_imputer & model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & mean ± std & imp_time (s) & fit_time (s) \\
\midrule
IterativeImputer_Niter=1 & KNNImputer1 & XGBoostRegressor_tuned & 0.694 & 0.519 & 0.528 & 0.537 & 0.570 ± 0.072 & 94.5 ± 1.6 & 5.1 ± 0.3 \\
KNNImputer_5 & KNNImputer1 & TabNetRegressor_default & 0.702 & 0.647 & 0.367 & 0.566 & 0.570 ± 0.127 & 2.6 ± 0.0 & 12.4 ± 0.6 \\
KNNImputer_5 & KNNImputer & TabNetRegressor_default & 0.702 & 0.647 & 0.367 & 0.566 & 0.570 ± 0.127 & 2.5 ± 0.0 & 12.4 ± 0.7 \\
KNNImputer_5 & KNNImputer1 & RandomForestRegressor & 0.744 & 0.563 & 0.404 & 0.593 & 0.576 ± 0.121 & 2.6 ± 0.0 & 40.7 ± 0.2 \\
IterativeImputer_Niter=1 & KNNImputer1 & XGBoostRegressor & 0.715 & 0.514 & 0.450 & 0.674 & 0.588 ± 0.109 & 93.0 ± 2.1 & 1.0 ± 0.0 \\
IterativeImputer_Niter=1 & KNNImputer1 & GatedAdditiveTreeEnsembleConfig_tab & 0.687 & 0.567 & 0.484 & 0.684 & 0.606 ± 0.085 & 99.0 ± 11.4 & 16.3 ± 0.2 \\
Iterativ