In [1]:
from pytorch_tabular import model_sweep
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig

In [2]:
# Basic imports
import os
import sys
import time
import pickle
from itertools import product
import warnings

# System path modification
sys.path.insert(0, '..')

# Data handling
import pandas as pd
import numpy as np
from scipy.stats import uniform, randint

# Machine learning imports
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import (
    LinearRegression, Lasso, LassoCV, MultiTaskLasso, MultiTaskLassoCV,
    ElasticNet, ElasticNetCV, MultiTaskElasticNet, MultiTaskElasticNetCV
)
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.cross_decomposition import PLSRegression
from sklearn.inspection import permutation_importance

# Custom modules
from src.train import *
from src.functions import *
from src.plots import *
from src.dataset import *
from src.multixgboost import *
from src.wrapper import *

# Visualizatiokn 
import matplotlib.pyplot as plt
import seaborn as sns

# Deep learning and machine learning specific 
import torch
from pytorch_tabnet.tab_model import TabNetRegressor
import xgboost as xgb
import shap

# Ignore warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Print CUDA availability for PyTorch
print(torch.cuda.is_available())
print(torch.cuda.device_count())

False
0




## Load data 

In [3]:
data = load_pickle_data_palettes()

results_pickle_folder = "../pickle/"

# Unpack data
df_X, df_y, df_all, df_FinalCombination = data["df_X"], data["df_y"], data["df_all"], data["df_FinalCombination"]
df_select_features = data["df_select_features"]

# Unpack feature selections
select_RNA, select_CSF, select_gene, select_MRIthickness = df_select_features.T.values

# Unpack colormaps
full_palette, gender_palette, dx_palette = data["colormaps"].values()

# Train-Test Split

In [4]:
idx_train = list(df_X.isna().any(axis=1))
idx_test = list(~df_X.isna().any(axis=1))

set_intersect_rid = set(df_all[idx_train].RID).intersection(set(df_all[idx_test].RID))
intersect_rid_idx = df_all.RID.isin(set_intersect_rid)

for i, bool_test in enumerate(idx_test): 
    if intersect_rid_idx.iloc[i] & bool_test:
        idx_test[i] = False
        idx_train[i] = True
        
df_X[["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]] = df_X[["APOE_epsilon2", "APOE_epsilon3", "APOE_epsilon4"]].astype("category")

df_X_train = df_X.loc[idx_train]
df_X_test = df_X.loc[idx_test]

df_y_train = df_y.loc[idx_train]
df_y_test = df_y.loc[idx_test]

c_train = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_train]
c_test = df_all[["AGE", "PTGENDER", "PTEDUCAT"]].iloc[idx_test]

In [5]:
ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

# Imputation pipelines
ordinal_imputer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

continuous_imputer = Pipeline([
    ('imputer', KNNImputer())
])

# Combine with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('ord', ordinal_imputer, ordinal_features),
        ('cont', continuous_imputer, continuous_features)
    ]
)

# Full X pipeline
X_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # ('scaler', StandardScaler())
])

df_X_train_imputed = df_X_train.copy()
df_X_test_imputed = df_X_test.copy()

# Fit-transform X once
df_X_train_imputed[ordinal_features+continuous_features] = X_pipeline.fit_transform(df_X_train)
df_X_test_imputed[ordinal_features+continuous_features]  = X_pipeline.transform(df_X_test)

# Demographics adjustment for y
demographic_adjustment_y = DemographicAdjustmentTransformer()
y_train_adjusted = demographic_adjustment_y.fit_transform(df_y_train, c_train)
y_test_adjusted = demographic_adjustment_y.transform(df_y_test, c_test)

# Demographics adjustment for X
demographic_adjustment_X = DemographicAdjustmentTransformer(categorical_columns=ordinal_features)
X_train_adjusted = demographic_adjustment_X.fit_transform(df_X_train_imputed, c_train)
X_test_adjusted = demographic_adjustment_X.transform(df_X_test_imputed, c_test)

# Standardize only continuous features
scaler = StandardScaler()

X_train_adjusted[continuous_features] = scaler.fit_transform(X_train_adjusted[continuous_features])
X_test_adjusted[continuous_features] = scaler.transform(X_test_adjusted[continuous_features])

# Train function

In [6]:
def train_imputer_model(
    df_X_train, df_X_test, df_y_train, df_y_test,
    c_train, c_test,
    ordinal_model, name_ordinal_imputer, 
    continuous_model, name_continuous_imputer, 
    model, name_model, 
    imputer_model=None, name_imputer=None, 
    separate_imputers=True,
    ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
): 
    # Define which columns are ordinal and which are continuous
    continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

    # Check if a general imputer model (like MissForest or MICEForest) is provided
    if imputer_model is not None and name_imputer is not None and not separate_imputers:
        # If `imputer_model` can handle both categorical and continuous data types
        print(f"Using general imputer model: {name_imputer}")

        # Ensure that ordinal columns are marked as categorical
        df_X_train = df_X_train.copy()
        df_X_test = df_X_test.copy()
        
        for col in ordinal_features:
            df_X_train[col] = df_X_train[col].astype("category")
            df_X_test[col] = df_X_test[col].astype("category")

        # Create a pipeline with the general imputer
        pipeline = Pipeline(steps=[
            (name_imputer, imputer_model)
        ])

        # Fit and transform the entire dataset with the general imputer
        pipeline.fit(df_X_train)
        X_train_imputed = pipeline.transform(df_X_train)
        X_test_imputed = pipeline.transform(df_X_test)

        # Convert transformed output back to DataFrame with original column names
        df_X_train_imputed = pd.DataFrame(X_train_imputed, columns=df_X_train.columns)
        df_X_test_imputed = pd.DataFrame(X_test_imputed, columns=df_X_test.columns)

    else:
        # Separate imputers for ordinal and continuous data
        print("Using separate imputers for ordinal and continuous data.")

        df_X_train = df_X_train.copy()
        df_X_test = df_X_test.copy()

        # Continuous Imputation Transformer (Example: SimpleImputer)
        continuous_imputer = Pipeline([
            (name_continuous_imputer, continuous_model),
        ])

        # Ordinal Imputation Transformer (Example: KNN Imputer)
        ordinal_imputer = Pipeline([
            (name_ordinal_imputer, ordinal_model)
        ])

        # Create a ColumnTransformer to apply the appropriate imputer to each type of variable
        preprocessor = ColumnTransformer(
            transformers=[
                ('ordinal', ordinal_imputer, ordinal_features),
                ('continuous', continuous_imputer, continuous_features)
            ],
            remainder='passthrough'
        )

        # Create the pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor)
        ])

         # Separate imputers for ordinal and continuous data

        # Fit and transform, then convert back to DataFrame with original column names

        if df_X_train.isna().any().any():
                
            start = time.time()
            pipeline.fit(df_X_train)
            end = time.time()

            impute_model_time = end - start

            X_train_imputed = pipeline.transform(df_X_train)
            df_X_train_imputed = df_X_train.copy()
            df_X_train_imputed[ordinal_features+continuous_features] = X_train_imputed
        else :
            print("No NaN in train data -> Keep as it is. ")
            df_X_train_imputed = df_X_train
            
            impute_model_time = None

        # Transform the test set
        if df_X_test.isna().any().any(): 
            X_test_imputed = pipeline.transform(df_X_test)
            df_X_test_imputed = df_X_test.copy()
            df_X_test_imputed[ordinal_features+continuous_features] = X_test_imputed
        else : 
            print("No NaN in test data -> Keep as it is. ")
            df_X_test_imputed = df_X_test

    # Demographics adjustment for y
    demographic_adjustment_y = DemographicAdjustmentTransformer()
    y_train_adjusted = demographic_adjustment_y.fit_transform(df_y_train, c_train)
    y_test_adjusted = demographic_adjustment_y.transform(df_y_test, c_test)

    # Demographics adjustment for X
    demographic_adjustment_X = DemographicAdjustmentTransformer(categorical_columns=ordinal_features)
    X_train_adjusted = demographic_adjustment_X.fit_transform(df_X_train_imputed, c_train)
    X_test_adjusted = demographic_adjustment_X.transform(df_X_test_imputed, c_test)

    print(X_train_adjusted)

    # Standardize only continuous features
    scaler = StandardScaler()

    X_train_adjusted[continuous_features] = scaler.fit_transform(X_train_adjusted[continuous_features])
    X_test_adjusted[continuous_features] = scaler.transform(X_test_adjusted[continuous_features])

    # Perform prediction and save variables
    start = time.time()

    if isinstance(model, TabNetRegressor): 
        X_train_adjusted = X_train_adjusted.values
        y_train_adjusted = y_train_adjusted.values

        X_test_adjusted = X_test_adjusted.values
        #y_test_adjusted = y_test_adjusted.values
    
    model.fit(X_train_adjusted, y_train_adjusted) 
    end = time.time()

    predict_model_time = end - start

    y_pred_adjusted = model.predict(X_test_adjusted)

    y_pred_adjusted = pd.DataFrame(y_pred_adjusted, columns=y_test_adjusted.columns)

    # Metrics computed in original space
    y_pred = demographic_adjustment_y.inverse_transform(y_pred_adjusted, c_test)

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : X_train_adjusted.shape, 
        "test_shape": X_test_adjusted.shape
    }
    
    if df_X_test.shape[0] != 1: 

        # Metrics computed in adjusted space
        mse_score_adj, mae_score_ajd, r2_adj, explained_variance_adj, corr_adj = compute_all_metrics(y_test_adjusted.values, y_pred_adjusted)

        results_adj = {
            "mse_score": mse_score_adj, 
            "mae_score":mae_score_ajd, 
            "r2":r2_adj, 
            "explained_variance":explained_variance_adj, 
            "corr":corr_adj, 
        }

        mse_score, mae_score, r2, explained_variance, corr = compute_all_metrics(df_y_test.values, y_pred)

        results_org = {
            "mse_score": mse_score, 
            "mae_score": mae_score, 
            "r2": r2, 
            "explained_variance": explained_variance, 
            "corr": corr, 
        }

    else : 
        print("Saving predictions in dict!")
        results_adj = {
            "y_pred": y_pred_adjusted.values, 
            "y_test": y_test_adjusted.values,
        }

        results_org = {
            "y_pred": y_pred.values, 
            "y_test": df_y_test.values,
        }


    dict_results = {
        "params": params, 
        "imputation_time": impute_model_time,
        "fitting_time": predict_model_time, 
        "results_adj": results_adj, 
        "results_org": results_org
        }

    return dict_results


In [7]:
def train_imputer_model(
    df_X_train, df_X_test, df_y_train, df_y_test,
    c_train, c_test,
    ordinal_model, name_ordinal_imputer, 
    continuous_model, name_continuous_imputer, 
    model, name_model, 
    imputer_model=None, name_imputer=None, 
    separate_imputers=True,
    ordinal_features = ['APOE_epsilon2', 'APOE_epsilon3', 'APOE_epsilon4']
): 
    # Define which columns are ordinal and which are continuous
    continuous_features = [col for col in df_X_train.columns if col not in ordinal_features]

    # Check if a general imputer model (like MissForest or MICEForest) is provided
    if imputer_model is not None and name_imputer is not None and not separate_imputers:
        # If `imputer_model` can handle both categorical and continuous data types
        print(f"Using general imputer model: {name_imputer}")

        # Ensure that ordinal columns are marked as categorical
        df_X_train = df_X_train.copy()
        df_X_test = df_X_test.copy()
        
        for col in ordinal_features:
            df_X_train[col] = df_X_train[col].astype("category")
            df_X_test[col] = df_X_test[col].astype("category")

        # Create a pipeline with the general imputer
        pipeline = Pipeline(steps=[
            (name_imputer, imputer_model)
        ])

        # Fit and transform the entire dataset with the general imputer
        pipeline.fit(df_X_train)
        X_train_imputed = pipeline.transform(df_X_train)
        X_test_imputed = pipeline.transform(df_X_test)

        # Convert transformed output back to DataFrame with original column names
        df_X_train_imputed = pd.DataFrame(X_train_imputed, columns=df_X_train.columns)
        df_X_test_imputed = pd.DataFrame(X_test_imputed, columns=df_X_test.columns)

    else:
        # Separate imputers for ordinal and continuous data
        print("Using separate imputers for ordinal and continuous data.")

        df_X_train = df_X_train.copy()
        df_X_test = df_X_test.copy()

        # Continuous Imputation Transformer (Example: SimpleImputer)
        continuous_imputer = Pipeline([
            (name_continuous_imputer, continuous_model),
        ])

        # Ordinal Imputation Transformer (Example: KNN Imputer)
        ordinal_imputer = Pipeline([
            (name_ordinal_imputer, ordinal_model)
        ])

        # Create a ColumnTransformer to apply the appropriate imputer to each type of variable
        preprocessor = ColumnTransformer(
            transformers=[
                ('ordinal', ordinal_imputer, ordinal_features),
                ('continuous', continuous_imputer, continuous_features)
            ],
            remainder='passthrough'
        )

        # Create the pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor)
        ])

         # Separate imputers for ordinal and continuous data
        # Fit and transform, then convert back to DataFrame with original column names

        if df_X_train.isna().any().any():
                
            start = time.time()
            pipeline.fit(df_X_train)
            end = time.time()

            impute_model_time = end - start

            X_train_imputed = pipeline.transform(df_X_train)
            df_X_train_imputed = df_X_train.copy()
            df_X_train_imputed[ordinal_features+continuous_features] = X_train_imputed
        else :
            print("No NaN in train data -> Keep as it is. ")
            df_X_train_imputed = df_X_train
            
            impute_model_time = None

        # Transform the test set
        if df_X_test.isna().any().any(): 
            X_test_imputed = pipeline.transform(df_X_test)
            df_X_test_imputed = df_X_test.copy()
            df_X_test_imputed[ordinal_features+continuous_features] = X_test_imputed
        else : 
            print("No NaN in test data -> Keep as it is. ")
            df_X_test_imputed = df_X_test

    # Demographics adjustment for y
    demographic_adjustment_y = DemographicAdjustmentTransformer()
    y_train_adjusted = demographic_adjustment_y.fit_transform(df_y_train, c_train)
    y_test_adjusted = demographic_adjustment_y.transform(df_y_test, c_test)

    # Demographics adjustment for X
    demographic_adjustment_X = DemographicAdjustmentTransformer(categorical_columns=ordinal_features)
    X_train_adjusted = demographic_adjustment_X.fit_transform(df_X_train_imputed, c_train)
    X_test_adjusted = demographic_adjustment_X.transform(df_X_test_imputed, c_test)

    print(X_train_adjusted)

    # Standardize only continuous features
    scaler = StandardScaler()

    X_train_adjusted[continuous_features] = scaler.fit_transform(X_train_adjusted[continuous_features])
    X_test_adjusted[continuous_features] = scaler.transform(X_test_adjusted[continuous_features])

    # Perform prediction and save variables
    start = time.time()

    if isinstance(model, TabNetRegressor): 
        X_train_adjusted = X_train_adjusted.values
        y_train_adjusted = y_train_adjusted.values

        X_test_adjusted = X_test_adjusted.values
        #y_test_adjusted = y_test_adjusted.values
    
    model.fit(X_train_adjusted, y_train_adjusted) 
    end = time.time()

    predict_model_time = end - start

    y_pred_adjusted = model.predict(X_test_adjusted)

    y_pred_adjusted = pd.DataFrame(y_pred_adjusted, columns=y_test_adjusted.columns)

    # Metrics computed in original space
    y_pred = demographic_adjustment_y.inverse_transform(y_pred_adjusted, c_test)

    params = {
        "ordinal_imputer": name_ordinal_imputer, 
        "continuous_imputer": name_continuous_imputer, 
        "model": name_model, "train_shape" : X_train_adjusted.shape, 
        "test_shape": X_test_adjusted.shape
    }
    
    if df_X_test.shape[0] != 1: 

        # Metrics computed in adjusted space
        mse_score_adj, mae_score_ajd, r2_adj, explained_variance_adj, corr_adj = compute_all_metrics(y_test_adjusted.values, y_pred_adjusted)

        results_adj = {
            "mse_score": mse_score_adj, 
            "mae_score":mae_score_ajd, 
            "r2":r2_adj, 
            "explained_variance":explained_variance_adj, 
            "corr":corr_adj, 
        }

        mse_score, mae_score, r2, explained_variance, corr = compute_all_metrics(df_y_test.values, y_pred)

        results_org = {
            "mse_score": mse_score, 
            "mae_score": mae_score, 
            "r2": r2, 
            "explained_variance": explained_variance, 
            "corr": corr, 
        }

    else : 
        print("Saving predictions in dict!")
        results_adj = {
            "y_pred": y_pred_adjusted.values, 
            "y_test": y_test_adjusted.values,
        }

        results_org = {
            "y_pred": y_pred.values, 
            "y_test": df_y_test.values,
        }


    dict_results = {
        "params": params, 
        "imputation_time": impute_model_time,
        "fitting_time": predict_model_time, 
        "results_adj": results_adj, 
        "results_org": results_org
        }

    return dict_results


# Hyperparameter search 

## Linear models

In [9]:
enet = MultiTaskElasticNet(max_iter=10000)

param_grid_enet = {
    'alpha': [1e-4, 1e-3, 0.01, 0.1, 1.0, 10.0, 100.0],
    'l1_ratio': [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
}

grid_search_enet = GridSearchCV(
    estimator=enet,
    param_grid=param_grid_enet,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search_enet.fit(X_train_preprocessed, y_train_adjusted)
best_enet = grid_search_enet.best_estimator_

print("Best ElasticNet Parameters:", grid_search_enet.best_params_)

Best ElasticNet Parameters: {'alpha': 0.01, 'l1_ratio': 0.01}


In [10]:
lasso = MultiTaskLasso(max_iter=10000)

param_grid_lasso = {
    'alpha': [1e-4, 1e-3, 1e-2, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
}

grid_search_lasso = GridSearchCV(
    estimator=lasso,
    param_grid=param_grid_lasso,
    cv=5,
    scoring='r2',
    n_jobs=-1
)

grid_search_lasso.fit(X_train_preprocessed, y_train_adjusted)
best_lasso = grid_search_lasso.best_estimator_

print("Best Lasso Parameters:", grid_search_lasso.best_params_)


Best Lasso Parameters: {'alpha': 0.001}


## Partial Least Squares Regression

In [21]:
# Initialize the estimator
pls = PLSRegression()

# Define the parameter grid
param_grid_pls = {
    'n_components': list(range(1, min(X_train_preprocessed.shape[1], y_train_adjusted.shape[1]) + 1))
}

# Set up the grid search
grid_search_pls = GridSearchCV(
    estimator=pls,
    param_grid=param_grid_pls,
    cv=5,  # Use cross-validation
    scoring='r2',  # Can be changed to 'neg_mean_squared_error' or others
    n_jobs=-1,
    verbose=1
)

# Fit the model
grid_search_pls.fit(X_train_preprocessed, y_train_adjusted)

# Extract the best model
best_pls = grid_search_pls.best_estimator_
print("Best PLS Parameters:", grid_search_pls.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best PLS Parameters: {'n_components': 4}


## XGBoost

In [28]:
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb

class XGBoostRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 num_boost_round=100,
                 max_depth=3,
                 lambda_=1.0,
                 learning_rate=0.1,
                 subsample=1.0,
                 colsample_bytree=1.0,
                 alpha=0.0,
                 tree_method="hist",
                 custom_obj=True,
                 custom_metric=True):
        
        # Expose all as attributes
        self.num_boost_round = num_boost_round
        self.max_depth = max_depth
        self.lambda_ = lambda_
        self.learning_rate = learning_rate
        self.subsample = subsample
        self.colsample_bytree = colsample_bytree
        self.alpha = alpha
        self.tree_method = tree_method
        self.custom_obj = custom_obj
        self.custom_metric = custom_metric
        
        self.model = None

    def get_params(self, deep=True):
        # Return all parameters
        return {
            'num_boost_round': self.num_boost_round,
            'max_depth': self.max_depth,
            'lambda_': self.lambda_,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'alpha': self.alpha,
            'tree_method': self.tree_method,
            'custom_obj': self.custom_obj,
            'custom_metric': self.custom_metric
        }

    def set_params(self, **params):
        for key, val in params.items():
            setattr(self, key, val)
        return self

    def fit(self, X, y):
        dtrain = xgb.DMatrix(data=X, label=y)
        params = {
            'max_depth': self.max_depth,
            'lambda': self.lambda_,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'alpha': self.alpha,
            'tree_method': self.tree_method,
            'num_target': y.shape[1]
        }
        obj_fn = squared_log if self.custom_obj else 'reg:squarederror'
        metric_fn = rmse if self.custom_metric else None

        self.model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=self.num_boost_round,
            obj=obj_fn,
            custom_metric=metric_fn
        )
        return self

    def predict(self, X):
        dtest = xgb.DMatrix(data=X)
        return self.model.predict(dtest)


In [None]:
param_dist = {
    'learning_rate': uniform(0.01, 0.29),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

rand_search_xgb = RandomizedSearchCV(
    estimator=XGBoostRegressor(tree_method="hist", custom_obj=True, custom_metric=True),
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

rand_search_xgb.fit(X_train_preprocessed, y_train_adjusted)
print("Best XGBoost Params:", rand_search_xgb.best_params_)
best_xgb = rand_search_xgb.best_estimator_

Fitting 5 folds for each of 50 candidates, totalling 250 fits




Best XGBoost Params: {'colsample_bytree': 0.5079831261101071, 'learning_rate': 0.0769592094304232, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.8049983288913105}


In [32]:
print("Best XGBoost Params:", rand_search_xgb.best_params_)
best_xgb = rand_search_xgb.best_estimator_

Best XGBoost Params: {'colsample_bytree': 0.5079831261101071, 'learning_rate': 0.0769592094304232, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.8049983288913105}


# Pytorch Tabular

In [None]:
targets=["ADNI_MEM", "ADNI_EF", "ADNI_VS", "ADNI_LAN"]

data_config = DataConfig(
    target=targets,
    continuous_cols=continuous_features,
    categorical_cols=ordinal_features
)

trainer_config = TrainerConfig(
    batch_size=32,
    max_epochs=100,
    auto_lr_find=True,
    early_stopping="valid_loss",          # Monitor validation loss
    early_stopping_mode="min",           # Stop when it stops decreasing
    early_stopping_patience=5,           # Wait up to 5 epochs with no improvement
    early_stopping_min_delta=0.001,      # Minimum change to qualify as an improvement
    checkpoints="valid_loss",            # Save best model on val_loss
    checkpoints_mode="min",
    checkpoints_save_top_k=1,
    load_best=True,                      # Load best model post-training
    progress_bar="simple",               # Basic progress bar (or "none")
    trainer_kwargs=dict(enable_model_summary=False),
    accelerator="cpu"                    # Change to "gpu" if available
)

optimizer_config = OptimizerConfig()

TypeError: DataConfig.__init__() got an unexpected keyword argument 'batch_size'

In [9]:
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular import MODEL_SWEEP_PRESETS

print(list(MODEL_SWEEP_PRESETS.keys()))

head_config = LinearHeadConfig(
    layers="",
    dropout=0.1,
    initialization=(  # No additional layer in head, just a mapping layer to output_dim
        "kaiming"
    ),
).__dict__  # Convert to dict to pass to the model config (OmegaConf doesn't accept objects)

['lite', 'standard', 'full', 'high_memory']


In [21]:
print(list(MODEL_SWEEP_PRESETS["standard"]))

[('CategoryEmbeddingModelConfig', {'layers': '256-128-64'}), ('CategoryEmbeddingModelConfig', {'layers': '512-128-64'}), ('GANDALFConfig', {'gflu_stages': 6}), ('GANDALFConfig', {'gflu_stages': 15}), ('TabNetModelConfig', {'n_d': 32, 'n_a': 32, 'n_steps': 3, 'gamma': 1.5, 'n_independent': 1, 'n_shared': 2}), ('TabNetModelConfig', {'n_d': 32, 'n_a': 32, 'n_steps': 5, 'gamma': 1.5, 'n_independent': 2, 'n_shared': 3}), ('FTTransformerConfig', {'num_heads': 4, 'num_attn_blocks': 4})]


In [22]:
df_Xy_preprocessed_adjusted = pd.DataFrame(np.concatenate([X_train_adjusted, y_train_adjusted], axis=1), columns = ordinal_features+continuous_features+targets)

In [23]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_Xy_preprocessed_adjusted, random_state=42, test_size=0.2)
print(f"Train Shape: {train.shape} | Test Shape: {test.shape}")

Train Shape: (2304, 280) | Test Shape: (577, 280)


In [24]:
metrics = [
    #"r2_score",
    "mean_absolute_error",
    "explained_variance",
]

# Provide an empty dict for each metric (no extra params needed):
metrics_params = [{}, {}]

# For regression, all metrics are computed on predictions themselves:
metrics_prob_input = [False] * len(metrics)

from omegaconf import DictConfig
torch.serialization.safe_globals([DictConfig])

# Filtering out the warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sweep_df, best_model = model_sweep(
        task="regression",  # One of "classification", "regression"
        train=train,
        test=test,
        data_config=data_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        model_list="full",
        common_model_args=dict(head="LinearHead", head_config=head_config),
        metrics = metrics,
        metrics_params=metrics_params,
        metrics_prob_input=metrics_prob_input,
        rank_metric=("mean_absolute_error", "higher_is_better"),
        progress_bar=True,
        verbose=False,
        suppress_lightning_logger=True,
    )

Output()

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

In [25]:
sweep_df

Unnamed: 0,model,# Params,epochs,test_loss_0,test_loss_1,test_loss_2,test_loss_3,test_loss,test_mean_absolute_error_0,test_mean_absolute_error_1,...,test_mean_absolute_error_3,test_mean_absolute_error,test_explained_variance_0,test_explained_variance_1,test_explained_variance_2,test_explained_variance_3,test_explained_variance,time_taken,time_taken_per_epoch,params
6,TabNetModel,227 T,5,0.897485,1.073166,0.678909,0.881682,3.531243,0.782892,0.807683,...,0.743692,2.992384,0.002445,0.001495,0.000981,0.001422,0.006342,33.088309,6.617662,"{'task': 'regression', 'head': 'LinearHead', '..."
0,AutoIntModel,121 T,85,0.567992,0.729492,0.633248,0.626819,2.55755,0.607697,0.660484,...,0.629624,2.534326,0.367111,0.324409,0.028654,0.28939,1.009563,700.243122,8.238154,"{'task': 'regression', 'head': 'LinearHead', '..."
4,GANDALFModel,6 M,100,0.522728,0.706339,0.603901,0.59732,2.430287,0.577457,0.658918,...,0.610478,2.468199,0.425688,0.343637,0.103546,0.327892,1.200763,573.442258,5.734423,"{'task': 'regression', 'head': 'LinearHead', '..."
7,TabTransformerModel,400 T,13,0.380481,0.766874,0.606936,0.507441,2.261732,0.47753,0.670824,...,0.556025,2.317068,0.584447,0.366383,0.095381,0.399458,1.445669,62.998656,4.84605,"{'task': 'regression', 'head': 'LinearHead', '..."
3,FTTransformerModel,416 T,20,0.427512,0.608372,0.574697,0.530267,2.140848,0.507553,0.613245,...,0.573561,2.309888,0.525614,0.432071,0.133033,0.392044,1.482762,4357.219138,217.860957,"{'task': 'regression', 'head': 'LinearHead', '..."
1,CategoryEmbeddingModel,263 T,9,0.355338,0.61213,0.602721,0.433786,2.003975,0.46319,0.613986,...,0.515381,2.206153,0.60128,0.441098,0.099351,0.485393,1.627123,21.759632,2.417737,"{'task': 'regression', 'head': 'LinearHead', '..."
2,DANetModel,2 M,16,0.354974,0.540967,0.669353,0.441169,2.006463,0.4579,0.564277,...,0.517689,2.191942,0.595836,0.488291,-0.013647,0.489327,1.559807,151.719178,9.482449,"{'task': 'regression', 'head': 'LinearHead', '..."
5,GatedAdditiveTreeEnsembleModel,7 M,11,0.32399,0.542354,0.560329,0.408959,1.835631,0.437651,0.571564,...,0.49284,2.101103,0.625733,0.485292,0.153444,0.506565,1.771033,346.253147,31.477559,"{'task': 'regression', 'head': 'LinearHead', '..."
