In [1]:
# Basic imports
import os
import sys
import time
import pickle
from itertools import product
import warnings

# System path modification
sys.path.insert(0, '..')

# Data handling
import pandas as pd
import numpy as np

# Machine learning imports
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import (
    LinearRegression, Lasso, LassoCV, MultiTaskLasso, MultiTaskLassoCV,
    ElasticNet, ElasticNetCV, MultiTaskElasticNet, MultiTaskElasticNetCV
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

from sklearn.cross_decomposition import PLSRegression
from sklearn.inspection import permutation_importance

# Custom modules
from src.train import *
from src.functions import *
from src.plots import *
from src.dataset import *
from src.multixgboost import *
from src.wrapper import *

# Visualizatiokn 
import matplotlib.pyplot as plt
import seaborn as sns

# Deep learning and machine learning specific 
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
import xgboost as xgb
import shap

from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

from pytorch_tabular.models import (
    GatedAdditiveTreeEnsembleConfig,
    DANetConfig,
    TabTransformerConfig,
    FTTransformerConfig,
    TabNetModelConfig,
)


# Ignore warnings
warnings.filterwarnings("ignore")

# Print CUDA availability for PyTorch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

False
0


## Load data 

In [2]:
data = load_pickle_data_palettes()

results_pickle_folder = "../pickle/"

# Unpack data
df_X, df_y, df_all, df_FinalCombination = data["df_X"], data["df_y"], data["df_all"], data["df_FinalCombination"]
dict_select = data["dict_select"]

# Unpack colormaps
full_palette, gender_palette, dx_palette = data["colormaps"].values()

# Train-Test Split

In [3]:
idx_train = list(df_X.isna().any(axis=1))
idx_test = list(~df_X.isna().any(axis=1))

set_intersect_rid = set(df_all[idx_train].RID).intersection(set(df_all[idx_test].RID))
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

for i, bool_test in enumerate(idx_test): 
    if intersect_rid_idx.iloc[i] & bool_test:
        idx_test[i] = False
        idx_train[i] = True
        
df_X[["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]] = df_X[["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]].astype("category")

df_X_train = df_X.loc[idx_train]
df_X_test = df_X.loc[idx_test]

df_y_train = df_y.loc[idx_train]
df_y_test = df_y.loc[idx_test]

c_train = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_train]
c_test = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_test]

In [4]:
df_all.SubjectID.iloc[idx_test]

3609    128_S_2002
5631    116_S_4167
5662    033_S_4176
5780    098_S_4215
5950    018_S_4349
6069    941_S_4292
6077    116_S_4453
6085    135_S_4489
6224    033_S_4505
6400    014_S_4576
6429    073_S_4300
7021    003_S_2374
7192    033_S_4179
Name: SubjectID, dtype: object

Define all the models and combinations to try out with their hyperparameters. 

In [5]:
random_state=42
n_imputation_iter = 10

# Define hyperparameters
gain_parameters = {
    'hint_rate': 0.9,
    'alpha': 100,
    'iterations': 1000
}

# Continuous Imputer List (list of tuples with unique strings and corresponding instances)
continuous_imputer_list = [
    ("KNNImputer", KNNImputer(n_neighbors=1)),
    ("KNNImputer_2", KNNImputer(n_neighbors=2)),
]

# Ordinal Imputer List (list of tuples with unique strings and corresponding instances)
ordinal_imputer_list = [
    ("SimpleImputer_most_frequent", SimpleImputer(strategy="most_frequent")),
    ("KNNImputer", KNNImputer(n_neighbors=1)),
    ("SimpleImputer_constant", SimpleImputer(strategy="constant", fill_value=-1))
]

# Predictive Models List (list of tuples with unique strings and corresponding instances)
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
    ("MultiTaskElasticNet", MultiTaskElasticNet()),
    ("MultiTaskElasticNet_tuned", MultiTaskElasticNet(**{'alpha': 0.01, 'l1_ratio': 0.01})),
    ("MultiTaskLasso", MultiTaskLasso()),
    ("MultiTaskLasso_tuned", MultiTaskLasso(**{'alpha': 0.001})),
    ("RandomForestRegressor", RandomForestRegressor()),
    ("XGBoostRegressor", XGBoostRegressor()),
    ("XGBoostRegressor_tuned", XGBoostRegressor(**{'colsample_bytree': 0.5079831261101071, 'learning_rate': 0.0769592094304232, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.8049983288913105})),
    ("TabNetRegressor_default", TabNetModelWrapper(n_a=8, n_d=8)),
    ("TabNetRegressor_custom", TabNetModelWrapper(n_a=32, n_d=32)),
    ("PLSRegression_4_components", PLSRegression(n_components=4))
]

# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Display all combinations
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: LinearRegression
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskElasticNet
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskElasticNet_tuned
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskLasso
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: MultiTaskLasso_tuned
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: RandomForestRegressor
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: XGBoostRegressor
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: XGBoostRegressor_tuned
Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_most_frequent, Model: TabNetRegressor_default
Continuous Imputer: KNNImputer, Ordinal Imputer: S

In [6]:
# Initialize HDF5 file
results_file = '../pickle/training_2_dict_results.pickle'

if os.path.exists(results_file): 

    with open(results_file, "rb") as input_file:
        all_dict_results = pickle.load(input_file)

else : 
    all_dict_results = []

In [None]:
for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : df_X_train.shape, 
        "test_shape": df_X_test.shape
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        
        continue

    try: 
    
        # Now you can call your `train_model` function with these components
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:  

        print(e)
    
        dict_results = {
        "params": params, 
        "imputation_time": None,
        "fitting_time": None, 
        "results_adj": None, 
        "results_org": None
    }
        
    # Optionally keep the all_dict_results list updated
    all_dict_results.append(dict_results)

        # Save the updated results back to the pickle file
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)


Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'LinearRegression', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet_tuned', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso_tuned', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'RandomForestRegressor', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'XGBoostRegressor', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(

In [9]:
# Store data (serialize)
with open(results_file, 'wb') as handle:
    pickle.dump(all_dict_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
with open('../pickle/training_2_dict_results.pickle', "rb") as input_file:
    dict_results_split = pickle.load(input_file)

In [11]:
dict_results_split

[{'params': {'ordinal_imputer': 'SimpleImputer_most_frequent',
   'continuous_imputer': 'KNNImputer',
   'model': 'LinearRegression',
   'train_shape': (2881, 276),
   'test_shape': (13, 276)},
  'imputation_time': 3.744856119155884,
  'fitting_time': 0.10365915298461914,
  'results_adj': {'mse_score': array([0.82472937, 0.40934318, 0.49734881, 0.66807641]),
   'mae_score': array([0.68760248, 0.52230642, 0.60995161, 0.64498316]),
   'r2': array([ 0.16646421,  0.53550172, -0.0384381 ,  0.18733798]),
   'explained_variance': array([ 0.25451649,  0.5631066 , -0.02406892,  0.44316425]),
   'corr': array([0.51351669, 0.76426752, 0.27239804, 0.67696079])},
  'results_org': {'mse_score': array([0.82472936, 0.40934317, 0.49734882, 0.66807639]),
   'mae_score': array([0.68760248, 0.52230642, 0.60995162, 0.64498315]),
   'r2': array([0.12270048, 0.54142269, 0.03132258, 0.21889061]),
   'explained_variance': array([0.21537583, 0.56867569, 0.04472644, 0.4647841 ]),
   'corr': array([0.48070093, 0.

## Add Pytorch Tabular models as well 

In [12]:
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

# Prepare Tabular configurations (shared for all PyTorch models)
data_config = DataConfig(
    target=df_y_train.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features
)
trainer_config = TrainerConfig(
    batch_size=1024, max_epochs=1, auto_lr_find=True,
    early_stopping="valid_loss", early_stopping_mode="min", early_stopping_patience=5,
    checkpoints="valid_loss", load_best=True, progress_bar="nones", accelerator="cpu"
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(dropout=0.1).__dict__

In [13]:
predictive_models_list += [
    ("GatedAdditiveTreeEnsembleConfig_tab", 
    TabularModelWrapper(
        GatedAdditiveTreeEnsembleConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        gflu_stages=6,
        gflu_dropout=0.0,
        tree_depth=5,
        num_trees=20,
        chain_trees=False,
        share_head_weights=True), data_config, trainer_config, optimizer_config 
    )),
    ("DANetConfig_tab",
    TabularModelWrapper(
        DANetConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        n_layers=8,
        k=5,
        dropout_rate=0.1), data_config, trainer_config, optimizer_config
    )),
    ("TabTransformerConfig_tab",
        TabularModelWrapper(
        TabTransformerConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        embedding_initialization="kaiming_uniform",
        embedding_bias=False), data_config, trainer_config, optimizer_config
    )),
    # ("FTTransformerConfig",
    #     TabularModelWrapper(
    #     FTTransformerConfig(
    #     task="regression",
    #     head="LinearHead",
    #     head_config=head_config), data_config, trainer_config, optimizer_config
    # )),
    ("TabNetModelConfig_tab",
        TabularModelWrapper(
        TabNetModelConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2), data_config, trainer_config, optimizer_config
    )),
]

# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : df_X_train.shape, 
        "test_shape": df_X_test.shape
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        
        continue

    try: 
        print(name_model)
        
        # Now you can call your `train_model` function with these components
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:  

        print(e)
    
        dict_results = {
        "params": params, 
        "imputation_time": None,
        "fitting_time": None, 
        "results_adj": None, 
        "results_org": None
    }
        
    # Optionally keep the all_dict_results list updated
    all_dict_results.append(dict_results)

        # Save the updated results back to the pickle file
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'LinearRegression', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskElasticNet_tuned', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'MultiTaskLasso_tuned', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'RandomForestRegressor', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(['SimpleImputer_most_frequent', 'KNNImputer', 'XGBoostRegressor', (2881, 276), (13, 276)])
Skipping existing combination: dict_values(

# Train models only on MRI features to compare performances

## Test train split

In [14]:
idx_train = list(df_X.isna().any(axis=1))
idx_test = list(~df_X.isna().any(axis=1))

set_intersect_rid = set(df_all[idx_train].RID).intersection(set(df_all[idx_test].RID))
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

for i, bool_test in enumerate(idx_test): 
    if intersect_rid_idx.iloc[i] & bool_test:
        idx_test[i] = False
        idx_train[i] = True

In [15]:
df_X[["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]] = df_X[["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]].astype("category")

In [16]:
df_X_train = df_X[dict_select["MRIth"]].loc[idx_train]
df_X_test = df_X[dict_select["MRIth"]].loc[idx_test]

df_y_train = df_y.loc[idx_train]
df_y_test = df_y.loc[idx_test]

c_train = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_train]
c_test = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_test]

In [17]:
random_state=42
n_imputation_iter = 10

# Define hyperparameters
gain_parameters = {
    'hint_rate': 0.9,
    'alpha': 100,
    'iterations': 1000
}

# Continuous Imputer List (list of tuples with unique strings and corresponding instances)
continuous_imputer_list = [
    ("NoImputer", KNNImputer(n_neighbors=1)),

]

# Ordinal Imputer List (list of tuples with unique strings and corresponding instances)
ordinal_imputer_list = [
    ("NoImputer", SimpleImputer(strategy="most_frequent")),
]

# Predictive Models List (list of tuples with unique strings and corresponding instances)
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
    ("MultiTaskElasticNet", MultiTaskElasticNet()),
    ("MultiTaskElasticNet_tuned", MultiTaskElasticNet(**{'alpha': 0.01, 'l1_ratio': 0.01})),
    ("MultiTaskLasso", MultiTaskLasso()),
    ("MultiTaskLasso_tuned", MultiTaskLasso(**{'alpha': 0.001})),
    ("RandomForestRegressor", RandomForestRegressor()),
    ("XGBoostRegressor", XGBoostRegressor()),
    ("XGBoostRegressor_tuned", XGBoostRegressor(**{'colsample_bytree': 0.5079831261101071, 'learning_rate': 0.0769592094304232, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.8049983288913105})),
    ("TabNetRegressor_default", TabNetModelWrapper(n_a=8, n_d=8)),
    ("TabNetRegressor_custom", TabNetModelWrapper(n_a=32, n_d=32)),
    ("PLSRegression_4_components", PLSRegression(n_components=4))
]


# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Display all combinations
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: LinearRegression
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskElasticNet
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskElasticNet_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskLasso
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: MultiTaskLasso_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: RandomForestRegressor
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: XGBoostRegressor
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: XGBoostRegressor_tuned
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: TabNetRegressor_default
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: TabNetRegressor_custom
Continuous Imputer: NoImputer, Ordinal Imputer: NoImputer, Model: PLSRegression_4_components
Combinations of preprocessing and model

In [18]:
# Initialize HDF5 file
results_file = '../pickle/training_2_dict_results.pickle'

with open(results_file, "rb") as input_file:
    all_dict_results = pickle.load(input_file)

In [19]:
for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    try: 
    
        # Now you can call your `train_model` function with these components
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:  

        print(e)
    
        params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : df_X_train.shape, 
        "test_shape": df_X_test.shape
    }
        dict_results = {
        "params": params, 
        "imputation_time": None,
        "fitting_time": None, 
        "results_adj": None, 
        "results_org": None
    }
        
    # Optionally keep the all_dict_results list updated
    all_dict_results.append(dict_results)

    # Save the updated results back to the pickle file
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 
Using separate imputers for ordinal and continuo

KeyboardInterrupt: 

In [20]:
# Store data (serialize)
with open(results_file, 'wb') as handle:
    pickle.dump(all_dict_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Add Pytorch models only on MRI features

In [21]:
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

# Prepare Tabular configurations (shared for all PyTorch models)
data_config = DataConfig(
    target=df_y_train.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features
)
trainer_config = TrainerConfig(
    batch_size=1024, max_epochs=1, auto_lr_find=True,
    early_stopping="valid_loss", early_stopping_mode="min", early_stopping_patience=5,
    checkpoints="valid_loss", load_best=True, progress_bar="nones", accelerator="cpu"
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(dropout=0.1).__dict__

In [22]:
predictive_models_list += [
    ("GatedAdditiveTreeEnsembleConfig_tab", 
    TabularModelWrapper(
        GatedAdditiveTreeEnsembleConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        gflu_stages=6,
        gflu_dropout=0.0,
        tree_depth=5,
        num_trees=20,
        chain_trees=False,
        share_head_weights=True), data_config, trainer_config, optimizer_config 
    )),
    ("DANetConfig_tab",
    TabularModelWrapper(
        DANetConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        n_layers=8,
        k=5,
        dropout_rate=0.1), data_config, trainer_config, optimizer_config
    )),
    ("TabTransformerConfig_tab",
        TabularModelWrapper(
        TabTransformerConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        embedding_initialization="kaiming_uniform",
        embedding_bias=False), data_config, trainer_config, optimizer_config
    )),
    # ("FTTransformerConfig",
    #     TabularModelWrapper(
    #     FTTransformerConfig(
    #     task="regression",
    #     head="LinearHead",
    #     head_config=head_config), data_config, trainer_config, optimizer_config
    # )),
    ("TabNetModelConfig_tab",
        TabularModelWrapper(
        TabNetModelConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2), data_config, trainer_config, optimizer_config
    )),
]

# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : df_X_train.shape, 
        "test_shape": df_X_test.shape
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        
        continue

    try: 
        print(name_model)
        
        # Now you can call your `train_model` function with these components
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:  

        print(e)
    
        dict_results = {
        "params": params, 
        "imputation_time": None,
        "fitting_time": None, 
        "results_adj": None, 
        "results_org": None
    }
        
    # Optionally keep the all_dict_results list updated
    all_dict_results.append(dict_results)

        # Save the updated results back to the pickle file
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)

Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'LinearRegression', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskElasticNet', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskElasticNet_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskLasso', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'MultiTaskLasso_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'RandomForestRegressor', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'XGBoostRegressor', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer', 'NoImputer', 'XGBoostRegressor_tuned', (2881, 200), (13, 200)])
Skipping existing combination: dict_values(['NoImputer'

Seed set to 42


"None of [Index(['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4'], dtype='object')] are in the [columns]"
DANetConfig_tab
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 


Seed set to 42


"None of [Index(['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4'], dtype='object')] are in the [columns]"
TabTransformerConfig_tab
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 


Seed set to 42


"None of [Index(['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4'], dtype='object')] are in the [columns]"
TabNetModelConfig_tab
Using separate imputers for ordinal and continuous data.
No NaN in train data -> Keep as it is. 
No NaN in test data -> Keep as it is. 


Seed set to 42


"None of [Index(['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4'], dtype='object')] are in the [columns]"


# Print Table for reporting

In [23]:
results_file = "../pickle/training_2_dict_results.pickle"

In [24]:
with open(results_file, "rb") as input_file:
    all_dict_results = pickle.load(input_file)

In [25]:
import pandas as pd
import numpy as np

def generate_metric_table(
    results_list,
    targets,
    metric_name,
    source="Adjusted",
    float_format="%.3f",
    csv_filename=None,
    sort_order="ascending"  # or "descending"
):
    """
    Create a LaTeX table for a single metric across targets, models, and imputers.
    Optionally export the same table as CSV and sort by mean performance.

    Parameters
    ----------
    results_list : list of dict
        List of experiment results.
    targets : list of str
        Target names (e.g., ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN']).
    metric_name : str
        Metric to extract (e.g., 'mae_score').
    source : str
        'Adjusted' or 'Original'.
    float_format : str
        Format for floats (e.g., '%.3f').
    csv_filename : str or None
        If provided, saves the table to CSV.
    sort_order : str
        'ascending' or 'descending' for sorting by mean.

    Returns
    -------
    str
        LaTeX-formatted table string.
    """
    rows = []
    version_key = "results_adj" if source.lower() == "adjusted" else "results_org"

    for res in results_list:
        result_block = res.get(version_key)
        if result_block is None:
            continue

        metric_values = result_block.get(metric_name)
        if metric_values is None:
            continue

        if len(metric_values) != len(targets):
            continue

        ordinal_imputer = res["params"].get("ordinal_imputer")
        model = res["params"].get("model")

        values = np.array(metric_values, dtype=np.float64)
        mean_val = np.mean(values)
        std_val = np.std(values)

        row = {
            "Ordinal Imputer": ordinal_imputer,
            "Model": model,
            "Mean": mean_val,  # for sorting
            "Mean ± SD": f"{mean_val:.3f} ± {std_val:.3f}",
        }
        row.update({target: val for target, val in zip(targets, values)})
        rows.append(row)

    df = pd.DataFrame(rows)

    # Reorder columns for display
    display_cols = ["Ordinal Imputer", "Model"] + targets + ["Mean ± SD"]
    df = df.sort_values(by="Mean", ascending=(sort_order == "ascending"))
    df = df[display_cols]

    # Save CSV
    if csv_filename:
        df.to_csv(csv_filename, index=False)

    # LaTeX output
    latex_table = df.to_latex(
        index=False,
        escape=False,
        float_format=float_format,
        caption=f"{metric_name.replace('_', ' ').upper()} across targets",
        label=f"tab:{metric_name}",
        longtable=False
    )

    return df, latex_table


In [26]:
latex_df, latex_mae = generate_metric_table(
    results_list=all_dict_results,
    targets=['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN'],
    metric_name='mae_score',
    source="Adjusted",
    csv_filename="../tables/2_training_train_test_mae_adjusted_sorted.csv",
    sort_order="ascending"
)
print(latex_mae)

\begin{table}
\caption{MAE SCORE across targets}
\label{tab:mae_score}
\begin{tabular}{llrrrrl}
\toprule
Ordinal Imputer & Model & ADNI_MEM & ADNI_EF & ADNI_VS & ADNI_LAN & Mean ± SD \\
\midrule
SimpleImputer_constant & TabNetRegressor_default & 0.620 & 0.593 & 0.549 & 0.476 & 0.559 ± 0.054 \\
SimpleImputer_constant & LinearRegression & 0.652 & 0.472 & 0.613 & 0.624 & 0.590 ± 0.070 \\
KNNImputer & LinearRegression & 0.651 & 0.470 & 0.612 & 0.629 & 0.590 ± 0.071 \\
SimpleImputer_most_frequent & LinearRegression & 0.656 & 0.470 & 0.617 & 0.629 & 0.593 ± 0.072 \\
KNNImputer & TabNetRegressor_custom & 0.735 & 0.519 & 0.485 & 0.649 & 0.597 ± 0.100 \\
SimpleImputer_constant & LinearRegression & 0.685 & 0.523 & 0.607 & 0.641 & 0.614 ± 0.059 \\
KNNImputer & LinearRegression & 0.683 & 0.521 & 0.606 & 0.645 & 0.614 ± 0.060 \\
SimpleImputer_most_frequent & LinearRegression & 0.688 & 0.522 & 0.610 & 0.645 & 0.616 ± 0.061 \\
SimpleImputer_most_frequent & TabNetRegressor_default & 0.623 & 0.676 & 0.

In [27]:
latex_df.Model.value_counts()

Model
LinearRegression               11
RandomForestRegressor          11
XGBoostRegressor               11
MultiTaskElasticNet            11
MultiTaskLasso                 11
TabNetRegressor_default        10
TabNetRegressor_custom         10
PLSRegression_4_components     10
MultiTaskLasso_tuned            4
MultiTaskElasticNet_tuned       4
XGBoostRegressor_tuned          3
PLSRegression_10_components     1
PLSRegression_2_components      1
Name: count, dtype: int64