In [None]:
# Basic imports
import os
import sys
import time
import pickle
from itertools import product
import warnings

# System path modification
sys.path.insert(0, '..')

# Data handling
import pandas as pd
import numpy as np
from scipy.stats import uniform, randint

# Machine learning imports
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import (
    LinearRegression, MultiTaskLasso, MultiTaskElasticNet
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import train_test_split

# Custom modules
from src.train import *
from src.functions import *
from src.plots import *
from src.dataset import *
from src.multixgboost import *
from src.wrapper import *

# Deep learning and machine learning specific 
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabular import model_sweep
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig

# Ignore warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Print CUDA availability for PyTorch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

False
0




# Hyperparameter Tuning and Model Prototyping

This notebook performs hyperparameter optimization across multiple model types using a random subset of patients selected from the incomplete cases, ensuring no overlap with the final evaluation dataset. It also prototypes PyTorch Tabular models for integration into the OPTIMUS pipeline.

The workflow includes:

- Hyperparameter search for linear models (MultiTaskElasticNet, MultiTaskLasso), Partial Least Squares (PLS), and XGBoost.
- Model sweep and prototyping using PyTorch Tabular to ensure compatibility with the existing pipeline.
- Saving tuned configurations for reuse in downstream cross-validation and benchmarking.

All procedures are applied on training subsets to prevent data leakage, with results stored in structured outputs for reproducibility.

--- 

### Notebook structure

1. Load Data

2. Train-Test Split

3. Hyperparameter Search
    1. Linear Models
    2. Partial Least Squares Regression
    3. XGBoost
    4. PyTorch Tabular

4. PyTorch Tabular

## Load data 

In [2]:
data = load_pickle_data_palettes()

results_pickle_folder = "../pickle/"

# Unpack data
df_X, df_y, df_all, df_FinalCombination = data["df_X"], data["df_y"], data["df_all"], data["df_FinalCombination"]
dict_select = data["dict_select"]

# Unpack colormaps
full_palette, gender_palette, dx_palette = data["colormaps"].values()

# Train-Test Split

In [None]:
idx_train = list(df_X.isna().any(axis=1))
idx_test = list(~df_X.isna().any(axis=1))

set_intersect_rid = set(df_all[idx_train].RID).intersection(set(df_all[idx_test].RID))
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

for i, bool_test in enumerate(idx_test): 
    if intersect_rid_idx.iloc[i] & bool_test:
        idx_test[i] = False
        idx_train[i] = True
        
df_X_train = df_X.loc[idx_train]
df_X_test = df_X.loc[idx_test]

df_y_train = df_y.loc[idx_train]
df_y_test = df_y.loc[idx_test]

c_train = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_train]
c_test = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_test]

In [35]:
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

# Imputation pipelines
ordinal_imputer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

continuous_imputer = Pipeline([
    ('imputer', KNNImputer())
])

# Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_imputer, ordinal_features),
        ('cont', continuous_imputer, continuous_features)
    ]
)

# Full X pipeline
X_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('scaler', StandardScaler())
])

df_X_train_imputed = df_X_train.copy()
df_X_test_imputed = df_X_test.copy()

# Fit-transform X once
df_X_train_imputed[ordinal_features+continuous_features] = X_pipeline.fit_transform(df_X_train)
df_X_test_imputed[ordinal_features+continuous_features]  = X_pipeline.transform(df_X_test)

# Demographics adjustment for y
demographic_adjustment_y = DemographicAdjustmentTransformer()
y_train_adjusted = demographic_adjustment_y.fit_transform(df_y_train, c_train)
y_test_adjusted = demographic_adjustment_y.transform(df_y_test, c_test)

# Demographics adjustment for X
demographic_adjustment_X = DemographicAdjustmentTransformer(categorical_columns=ordinal_features)
X_train_adjusted = demographic_adjustment_X.fit_transform(df_X_train_imputed, c_train)
X_test_adjusted = demographic_adjustment_X.transform(df_X_test_imputed, c_test)

# Standardize only continuous features
scaler = StandardScaler()

X_train_adjusted[continuous_features] = scaler.fit_transform(X_train_adjusted[continuous_features])
X_test_adjusted[continuous_features] = scaler.transform(X_test_adjusted[continuous_features])

# Hyperparameter search 

## Linear models

In [7]:
enet = MultiTaskElasticNet(max_iter=10000)

param_grid_enet = {
    'alpha': [1e-4, 1e-3, 0.01, 0.1, 1.0, 10.0, 100.0],
    'l1_ratio': [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
}

grid_search_enet = GridSearchCV(
    estimator=enet,
    param_grid=param_grid_enet,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search_enet.fit(X_train_adjusted, y_train_adjusted)
best_enet = grid_search_enet.best_estimator_

print("Best ElasticNet Parameters:", grid_search_enet.best_params_)

Best ElasticNet Parameters: {'alpha': 0.1, 'l1_ratio': 0.1}


In [8]:
lasso = MultiTaskLasso(max_iter=10000)

param_grid_lasso = {
    'alpha': [1e-4, 1e-3, 1e-2, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
}

grid_search_lasso = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid_lasso,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search_lasso.fit(X_train_adjusted, y_train_adjusted)
best_lasso = grid_search_lasso.best_estimator_

print("Best Lasso Parameters:", grid_search_lasso.best_params_)


Best Lasso Parameters: {'alpha': 0.01}


## Partial Least Squares Regression

In [9]:
# Initialize the estimator
pls = PLSRegression()

# Define the parameter grid
param_grid_pls = {
    'n_components': list(range(1, min(X_train_adjusted.shape[1], y_train_adjusted.shape[1]) + 1))
}

# Set up the grid search
grid_search_pls = GridSearchCV(
    estimator=pls,
    param_grid=param_grid_pls,
    cv=5,  # Use cross-validation
    scoring='r2',  # Can be changed to 'neg_mean_squared_error' or others
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search_pls.fit(X_train_adjusted, y_train_adjusted)

# Extract the best model
best_pls = grid_search_pls.best_estimator_
print("Best PLS Parameters:", grid_search_pls.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best PLS Parameters: {'n_components': 4}


## XGBoost

In [36]:
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb

class XGBoostRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 num_boost_round=100,
                 max_depth=3,
                 lambda_=1.0,
                 learning_rate=0.1,
                 subsample=1.0,
                 colsample_bytree=1.0,
                 alpha=0.0,
                 tree_method="hist",
                 custom_obj=True,
                 custom_metric=True):
        
        # Expose all as attributes
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.lambda_ = lambda_
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.alpha = alpha
        self.tree_method = tree_method
        self.custom_obj = custom_obj
        self.custom_metric = custom_metric
        
        self.model = None

    def get_params(self, deep=True):
        # Return all parameters
        return {
            'num_boost_round': self.num_boost_round,
            'max_depth': self.max_depth,
            'lambda_': self.lambda_,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'alpha': self.alpha,
            'tree_method': self.tree_method,
            'custom_obj': self.custom_obj,
            'custom_metric': self.custom_metric
        }

    def set_params(self, **params):
        for key, val in params.items():
            setattr(self, key, val)
        return self

    def fit(self, X, y):
        dtrain = xgb.DMatrix(data=X, label=y, enable_categorical=True)
        params = {
            'max_depth': self.max_depth,
            'lambda': self.lambda_,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'alpha': self.alpha,
            'tree_method': self.tree_method,
            'num_target': y.shape[1]
        }
        obj_fn = squared_log if self.custom_obj else 'reg:squarederror'
        metric_fn = rmse if self.custom_metric else None

        self.model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
            obj=obj_fn,
            custom_metric=metric_fn
        )
        return self

    def predict(self, X):
        dtest = xgb.DMatrix(data=X, enable_categorical=True)
        return self.model.predict(dtest)


In [38]:
param_dist = {
    'learning_rate': uniform(0.01, 0.29),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

rand_search_xgb = RandomizedSearchCV(
    estimator=XGBoostRegressor(tree_method="hist", custom_obj=True, custom_metric=True),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

X_train_adjusted[ordinal_features] = X_train_adjusted[ordinal_features].astype("int")
X_test_adjusted[ordinal_features] = X_test_adjusted[ordinal_features].astype("int")

rand_search_xgb.fit(X_train_adjusted, y_train_adjusted)
print("Best XGBoost Params:", rand_search_xgb.best_params_)
best_xgb = rand_search_xgb.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits


135 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/cschneuwly/.cache/pypoetry/virtualenvs/optimus-iOcAib6k-py3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/tmp/ipykernel_1475738/2137379911.py", line 52, in fit
  File "/home/cschneuwly/.cache/pypoetry/virtualenvs/optimus-iOcAib6k-py3.12/lib/python3.12/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/cschneuwly/.cache/pypoetry/virtualenvs/optimus-iOcAib6k-py3.12/lib/python3.12/site-pac

Best XGBoost Params: {'colsample_bytree': 0.8776807051588262, 'learning_rate': 0.13329520360246094, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.5924272277627636}


In [39]:
print("Best XGBoost Params:", rand_search_xgb.best_params_)
best_xgb = rand_search_xgb.best_estimator_

Best XGBoost Params: {'colsample_bytree': 0.8776807051588262, 'learning_rate': 0.13329520360246094, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.5924272277627636}


In [28]:
class XGBoostRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 num_boost_round=100,
                 max_depth=3,
                 lambda_=1.0,
                 learning_rate=0.1,
                 subsample=1.0,
                 colsample_bytree=1.0,
                 alpha=0.0,
                 tree_method="hist",
                 custom_obj=True,
                 custom_metric=True):
        
        # Expose all as attributes
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.lambda_ = lambda_
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.alpha = alpha
        self.tree_method = tree_method
        self.custom_obj = custom_obj
        self.custom_metric = custom_metric
        
        self.model = None

    def get_params(self, deep=True):
        # Return all parameters
        return {
            'num_boost_round': self.num_boost_round,
            'max_depth': self.max_depth,
            'lambda_': self.lambda_,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'alpha': self.alpha,
            'tree_method': self.tree_method,
            'custom_obj': self.custom_obj,
            'custom_metric': self.custom_metric
        }

    def set_params(self, **params):
        for key, val in params.items():
            setattr(self, key, val)
        return self

    def fit(self, X, y):
        dtrain = xgb.DMatrix(data=X, label=y, enable_categorical=True)
        params = {
            'max_depth': self.max_depth,
            'lambda': self.lambda_,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'alpha': self.alpha,
            'tree_method': self.tree_method,
            'num_target': y.shape[1]
        }
        obj_fn = squared_log if self.custom_obj else 'reg:squarederror'
        metric_fn = rmse if self.custom_metric else None

        self.model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
            obj=obj_fn,
            custom_metric=metric_fn
        )
        return self

    def predict(self, X):
        dtest = xgb.DMatrix(data=X, enable_categorical=True)
        return self.model.predict(dtest)


In [21]:
XGBoostRegressor({'colsample_bytree': 0.5079831261101071, 'learning_rate': 0.0769592094304232, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.8049983288913105})

##  Pytorch Tabular

In [22]:
targets=["ADNI_MEM", "ADNI_EF", "ADNI_VS", "ADNI_LAN"]

data_config = DataConfig(
    target=targets,
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features
)

trainer_config = TrainerConfig(
    batch_size=32,
    max_epochs=100,
    auto_lr_find=True,
    early_stopping="valid_loss",          # Monitor validation loss
    early_stopping_mode="min",           # Stop when it stops decreasing
    early_stopping_patience=5,           # Wait up to 5 epochs with no improvement
    early_stopping_min_delta=0.001,      # Minimum change to qualify as an improvement
    checkpoints="valid_loss",            # Save best model on val_loss
    checkpoints_mode="min",
    checkpoints_save_top_k=1,
    load_best=True,                      # Load best model post-training
    progress_bar="simple",               # Basic progress bar (or "none")
    trainer_kwargs=dict(enable_model_summary=False),
    accelerator="cpu"                    # Change to "gpu" if available
)

optimizer_config = OptimizerConfig()

In [23]:
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular import MODEL_SWEEP_PRESETS

print(list(MODEL_SWEEP_PRESETS.keys()))

head_config = LinearHeadConfig(
    layers="",
    dropout=0.1,
    initialization=(  # No additional layer in head, just a mapping layer to output_dim
        "kaiming"
    ),
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

['lite', 'standard', 'full', 'high_memory']


In [24]:
print(list(MODEL_SWEEP_PRESETS["standard"]))

[('CategoryEmbeddingModelConfig', {'layers': '256-128-64'}), ('CategoryEmbeddingModelConfig', {'layers': '512-128-64'}), ('GANDALFConfig', {'gflu_stages': 6}), ('GANDALFConfig', {'gflu_stages': 15}), ('TabNetModelConfig', {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5, 'n_independent': 1, 'n_shared': 2}), ('TabNetModelConfig', {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5, 'n_independent': 2, 'n_shared': 3}), ('FTTransformerConfig', {'num_heads': 4, 'num_attn_blocks': 4})]


In [25]:
df_Xy_preprocessed_adjusted = pd.DataFrame(np.concatenate([X_train_adjusted, y_train_adjusted], axis=1), columns = ordinal_features+continuous_features+targets)
train, test = train_test_split(df_Xy_preprocessed_adjusted, random_state=42, test_size=0.2)
print(f"Train Shape: {train.shape} | Test Shape: {test.shape}")

Train Shape: (2304, 260) | Test Shape: (577, 260)


In [26]:
metrics = [
    #"r2_score",
    "mean_absolute_error",
    "explained_variance",
]

# Provide an empty dict for each metric (no extra params needed):
metrics_params = [{}, {}]

# For regression, all metrics are computed on predictions themselves:
metrics_prob_input = [False] * len(metrics)

from omegaconf import DictConfig
torch.serialization.safe_globals([DictConfig])

# Filtering out the warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sweep_df, best_model = model_sweep(
        task="regression",  # One of "classification", "regression"
        train=train,
        test=test,
        data_config=data_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        model_list="full",
        common_model_args=dict(head="LinearHead", head_config=head_config),
        metrics = metrics,
        metrics_params=metrics_params,
        metrics_prob_input=metrics_prob_input,
        rank_metric=("mean_absolute_error", "higher_is_better"),
        progress_bar=True,
        verbose=False,
        suppress_lightning_logger=True,
    )

Output()

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

NameError: name 'exit' is not defined

In [None]:
sweep_df.sort_values("test_mean_absolute_error_0")

Unnamed: 0,model,# Params,epochs,test_loss_0,test_loss_1,test_loss_2,test_loss_3,test_loss,test_mean_absolute_error_0,test_mean_absolute_error_1,...,test_mean_absolute_error_3,test_mean_absolute_error,test_explained_variance_0,test_explained_variance_1,test_explained_variance_2,test_explained_variance_3,test_explained_variance,time_taken,time_taken_per_epoch,params
5,GatedAdditiveTreeEnsembleModel,7 M,11,0.32399,0.542354,0.560329,0.408959,1.835631,0.437651,0.571564,...,0.49284,2.101103,0.625733,0.485292,0.153444,0.506565,1.771033,346.253147,31.477559,"{'task': 'regression', 'head': 'LinearHead', '..."
2,DANetModel,2 M,16,0.354974,0.540967,0.669353,0.441169,2.006463,0.4579,0.564277,...,0.517689,2.191942,0.595836,0.488291,-0.013647,0.489327,1.559807,151.719178,9.482449,"{'task': 'regression', 'head': 'LinearHead', '..."
1,CategoryEmbeddingModel,263 T,9,0.355338,0.61213,0.602721,0.433786,2.003975,0.46319,0.613986,...,0.515381,2.206153,0.60128,0.441098,0.099351,0.485393,1.627123,21.759632,2.417737,"{'task': 'regression', 'head': 'LinearHead', '..."
7,TabTransformerModel,400 T,13,0.380481,0.766874,0.606936,0.507441,2.261732,0.47753,0.670824,...,0.556025,2.317068,0.584447,0.366383,0.095381,0.399458,1.445669,62.998656,4.84605,"{'task': 'regression', 'head': 'LinearHead', '..."
3,FTTransformerModel,416 T,20,0.427512,0.608372,0.574697,0.530267,2.140848,0.507553,0.613245,...,0.573561,2.309888,0.525614,0.432071,0.133033,0.392044,1.482762,4357.219138,217.860957,"{'task': 'regression', 'head': 'LinearHead', '..."
4,GANDALFModel,6 M,100,0.522728,0.706339,0.603901,0.59732,2.430287,0.577457,0.658918,...,0.610478,2.468199,0.425688,0.343637,0.103546,0.327892,1.200763,573.442258,5.734423,"{'task': 'regression', 'head': 'LinearHead', '..."
0,AutoIntModel,121 T,85,0.567992,0.729492,0.633248,0.626819,2.55755,0.607697,0.660484,...,0.629624,2.534326,0.367111,0.324409,0.028654,0.28939,1.009563,700.243122,8.238154,"{'task': 'regression', 'head': 'LinearHead', '..."
6,TabNetModel,227 T,5,0.897485,1.073166,0.678909,0.881682,3.531243,0.782892,0.807683,...,0.743692,2.992384,0.002445,0.001495,0.000981,0.001422,0.006342,33.088309,6.617662,"{'task': 'regression', 'head': 'LinearHead', '..."


### Ensure match with pipeline

In [None]:
random_state=42
n_imputation_iter = 10

# Continuous Imputer List (list of tuples with unique strings and corresponding instances)
continuous_imputer_list = [
    ("KNNImputer", KNNImputer(n_neighbors=1)),
]

# Ordinal Imputer List (list of tuples with unique strings and corresponding instances)
ordinal_imputer_list = [
    ("SimpleImputer_constant", SimpleImputer(strategy="constant", fill_value=-1))
]

# Predictive Models List (list of tuples with unique strings and corresponding instances)
predictive_models_list = [
    ("LinearRegression", LinearRegression()),
]

# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

# Display all combinations
for continuous_imputer, ordinal_imputer, model in combinations:
    print(f"Continuous Imputer: {continuous_imputer[0]}, Ordinal Imputer: {ordinal_imputer[0]}, Model: {model[0]}")

print(f"Combinations of preprocessing and models to test : {len(combinations)}")

Continuous Imputer: KNNImputer, Ordinal Imputer: SimpleImputer_constant, Model: LinearRegression
Combinations of preprocessing and models to test : 1


In [None]:
# Initialize HDF5 file
results_file = '../pickle/test.pickle'

if os.path.exists(results_file): 

    with open(results_file, "rb") as input_file:
        all_dict_results = pickle.load(input_file)

else : 
    all_dict_results = []
            
for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : df_X_train.shape, 
        "test_shape": df_X_test.shape
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        
        continue

    try: 
    
        # Now you can call your `train_model` function with these components
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:  

        print(e)
    
        dict_results = {
        "params": params, 
        "imputation_time": None,
        "fitting_time": None, 
        "results_adj": None, 
        "results_org": None
    }
        
    # Optionally keep the all_dict_results list updated
    all_dict_results.append(dict_results)

        # Save the updated results back to the pickle file
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)


Skipping existing combination: dict_values(['SimpleImputer_constant', 'KNNImputer', 'LinearRegression', (2881, 348), (13, 348)])


In [None]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    GatedAdditiveTreeEnsembleConfig,
    DANetConfig,
    TabTransformerConfig,
    FTTransformerConfig,
    TabNetModelConfig,
)
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig

# Prepare Tabular configurations (shared for all PyTorch models)
data_config = DataConfig(
    target=df_y_train.columns.tolist(),
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features
)
trainer_config = TrainerConfig(
    batch_size=1024, max_epochs=1, auto_lr_find=True,
    early_stopping="valid_loss", early_stopping_mode="min", early_stopping_patience=5,
    checkpoints="valid_loss", load_best=True, progress_bar="nones", accelerator="cpu"
)
optimizer_config = OptimizerConfig()
head_config = LinearHeadConfig(dropout=0.1).__dict__

# Utility to wrap TabularModel into sklearn-like fit/predict
class TabularModelWrapper:
    def __init__(self, model_config, data_config, trainer_config, optimizer_config):
        self.model_config = model_config
        self.data_config = data_config
        self.trainer_config = trainer_config
        self.optimizer_config = optimizer_config
        self.model = None

    def fit(self, X, y):
        df = X.copy().reset_index(drop=True)
        for c in y.columns:
            df[c] = y[c].reset_index(drop=True)

        self.model = TabularModel(
            data_config=self.data_config,
            model_config=self.model_config,
            optimizer_config=self.optimizer_config,
            trainer_config=self.trainer_config
        )
        self.model.fit(train=df, validation=df)  # ideally separate validation
        return self

    def predict(self, X):
        assert self.model is not None, "You must call .fit(...) before .predict(...)"
        preds = self.model.predict(X.reset_index(drop=True))
        return preds[[f"{c}_prediction" for c in df_y_train.columns]].values

# Add PyTorch models to list
head_config = LinearHeadConfig(dropout=0.1).__dict__

predictive_models_list += [
    ("GatedAdditiveTreeEnsembleConfig_tab", 
    TabularModelWrapper(
        GatedAdditiveTreeEnsembleConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        gflu_stages=6,
        gflu_dropout=0.0,
        tree_depth=5,
        num_trees=20,
        chain_trees=False,
        share_head_weights=True), data_config, trainer_config, optimizer_config 
    )),
    ("DANetConfig_tab",
    TabularModelWrapper(
        DANetConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        n_layers=8,
        k=5,
        dropout_rate=0.1), data_config, trainer_config, optimizer_config
    )),
    ("TabTransformerConfig_tab",
        TabularModelWrapper(
        TabTransformerConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        embedding_initialization="kaiming_uniform",
        embedding_bias=False), data_config, trainer_config, optimizer_config
    )),
    # ("FTTransformerConfig",
    #     TabularModelWrapper(
    #     FTTransformerConfig(
    #     task="regression",
    #     head="LinearHead",
    #     head_config=head_config), data_config, trainer_config, optimizer_config
    # )),
    ("TabNetModelConfig_tab",
        TabularModelWrapper(
        TabNetModelConfig(
        task="regression",
        head="LinearHead",
        head_config=head_config,
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2), data_config, trainer_config, optimizer_config
    )),
]

# Generate all combinations
combinations = list(product(continuous_imputer_list, ordinal_imputer_list, predictive_models_list))

for continuous_imputer, ordinal_imputer, model in combinations:
    name_continuous_imputer, continuous_imputer_instance = continuous_imputer
    name_ordinal_imputer, ordinal_imputer_instance = ordinal_imputer
    name_model, model_instance = model

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : df_X_train.shape, 
        "test_shape": df_X_test.shape
    }

    if any(result['params'] == params for result in all_dict_results):
        # Skip this iteration if the combination exists
        print(f"Skipping existing combination: {params.values()}")
        
        continue

    try: 
        print(name_model)
        print(model_instance)
    
        # Now you can call your `train_model` function with these components
        dict_results = train_imputer_model(
            df_X_train, df_X_test, df_y_train, df_y_test,
            c_train, c_test,
            ordinal_imputer_instance, name_ordinal_imputer,
            continuous_imputer_instance, name_continuous_imputer,
            model_instance, name_model,
            separate_imputers=True  # Or however you want to specify
        )

    except Exception as e:  

        print(e)
    
        dict_results = {
        "params": params, 
        "imputation_time": None,
        "fitting_time": None, 
        "results_adj": None, 
        "results_org": None
    }
        
    # Optionally keep the all_dict_results list updated
    all_dict_results.append(dict_results)

        # Save the updated results back to the pickle file
    with open(results_file, 'wb') as f:
        pickle.dump(all_dict_results, f)


Skipping existing combination: dict_values(['SimpleImputer_constant', 'KNNImputer', 'LinearRegression', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_constant', 'KNNImputer', 'GatedAdditiveTreeEnsembleConfig_tab', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_constant', 'KNNImputer', 'DANetConfig_tab', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_constant', 'KNNImputer', 'TabTransformerConfig_tab', (2881, 348), (13, 348)])
Skipping existing combination: dict_values(['SimpleImputer_constant', 'KNNImputer', 'TabNetModelConfig_tab', (2881, 348), (13, 348)])


In [None]:
def generate_metric_table(
    results_list,
    targets,
    metric_name,
    source="Adjusted",
    float_format="%.3f",
    csv_filename=None,
    sort_order="ascending"  # or "descending"
):
    """
    Create a LaTeX table for a single metric across targets, models, and imputers.
    Optionally export the same table as CSV and sort by mean performance.

    Parameters
    ----------
    results_list : list of dict
        List of experiment results.
    targets : list of str
        Target names (e.g., ['ADNI_MEM', 'ADNI_EF', 'ADNI_VS', 'ADNI_LAN']).
    metric_name : str
        Metric to extract (e.g., 'mae_score').
    source : str
        'Adjusted' or 'Original'.
    float_format : str
        Format for floats (e.g., '%.3f').
    csv_filename : str or None
        If provided, saves the table to CSV.
    sort_order : str
        'ascending' or 'descending' for sorting by mean.

    Returns
    -------
    str
        LaTeX-formatted table string.
    """
    rows = []
    version_key = "results_adj" if source.lower() == "adjusted" else "results_org"

    for res in results_list:
        result_block = res.get(version_key)
        if result_block is None:
            continue

        metric_values = result_block.get(metric_name)
        if metric_values is None:
            continue

        if len(metric_values) != len(targets):
            continue

        ordinal_imputer = res["params"].get("ordinal_imputer")
        model = res["params"].get("model")

        values = np.array(metric_values, dtype=np.float64)
        mean_val = np.mean(values)
        std_val = np.std(values)

        row = {
            "Ordinal Imputer": ordinal_imputer,
            "Model": model,
            "Mean": mean_val,  # for sorting
            "Mean ± SD": f"{mean_val:.3f} ± {std_val:.3f}",
        }
        row.update({target: val for target, val in zip(targets, values)})
        rows.append(row)

    df = pd.DataFrame(rows)

    # Reorder columns for display
    display_cols = ["Ordinal Imputer", "Model"] + targets + ["Mean ± SD"]
    df = df.sort_values(by="Mean", ascending=(sort_order == "ascending"))
    df = df[display_cols]

    # Save CSV
    if csv_filename:
        df.to_csv(csv_filename, index=False)

    # LaTeX output
    latex_table = df.to_latex(
        index=False,
        escape=False,
        float_format=float_format,
        caption=f"{metric_name.replace('_', ' ').upper()} across targets",
        label=f"tab:{metric_name}",
        longtable=False
    )

    return df, latex_table


In [None]:
df, latex_table = generate_metric_table(all_dict_results, targets=dict_select["ADNI_cog"], metric_name="mae_score", source="Adjusted", csv_filename=None, sort_order="ascending")

In [None]:
df

Unnamed: 0,Ordinal Imputer,Model,ADNI_MEM,ADNI_EF,ADNI_VS,ADNI_LAN,Mean ± SD
4,SimpleImputer_constant,LinearRegression,0.659125,0.55894,0.565599,0.602758,0.597 ± 0.040
0,SimpleImputer_constant,LinearRegression,0.684685,0.522703,0.607183,0.640623,0.614 ± 0.059
3,SimpleImputer_constant,TabTransformerConfig_tab,0.835669,0.716633,0.761407,0.629682,0.736 ± 0.075
7,SimpleImputer_constant,TabTransformerConfig_tab,0.744698,0.940382,0.689909,0.811252,0.797 ± 0.093
2,SimpleImputer_constant,DANetConfig_tab,0.996245,0.749981,0.615216,0.928531,0.822 ± 0.150
8,SimpleImputer_constant,TabNetModelConfig_tab,0.992423,0.783072,0.621848,0.932718,0.833 ± 0.144
6,SimpleImputer_constant,DANetConfig_tab,1.025889,0.778495,0.616289,0.917141,0.834 ± 0.153
5,SimpleImputer_constant,GatedAdditiveTreeEnsembleConfig_tab,1.165538,0.740761,0.64517,0.807058,0.840 ± 0.197
1,SimpleImputer_constant,GatedAdditiveTreeEnsembleConfig_tab,0.957282,2.19972,1.690917,0.922638,1.443 ± 0.534
