In [1]:
import torch
from sklearn.datasets import fetch_openml

from neural_blueprints.architectures import MLP
from neural_blueprints.config.architectures import MLPConfig
from neural_blueprints.config.utils import TrainerConfig
from neural_blueprints.config.components.composite.projections import TabularProjectionConfig
from neural_blueprints.utils import Trainer, infer_types
from neural_blueprints.preprocess import TabularPreprocessor
from neural_blueprints.datasets import MaskedTabularDataset, TabularLabelDataset

import logging
# Send log records (including those emitted by neural_blueprints) to the cell output.
# DEBUG is the most verbose standard level; switch to logging.INFO for less detail.
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

# Seed PyTorch's global RNG so a Restart & Run All is repeatable.
# NOTE(review): only torch is seeded here; if neural_blueprints draws from numpy or
# Python's `random` for splitting/masking, seed those as well.
SEED = 42
torch.manual_seed(SEED);  # trailing ';' keeps the returned Generator out of the cell output

In [2]:
# Fetch the OpenML "adult" dataset (version 2) as pandas objects.
adult_bunch = fetch_openml(name="adult", version=2, as_frame=True)
X, y = adult_bunch.data, adult_bunch.target

# Build one frame holding the features plus the target as an 'income' column.
# `assign` returns a new frame, so X itself is left unmodified.
data = X.assign(income=y)

# Cast each column to the dtype chosen by infer_types.
dtypes = infer_types(data)
data = data.astype(dtypes)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Run the tabular preprocessor; it returns the transformed frame together with the
# names of the discrete and continuous feature columns.
# NOTE(review): `data` is both the input and the output of this cell, so re-running
# the cell on its own feeds already-processed data back in — re-run from the loading
# cell instead.
preprocessor = TabularPreprocessor()
preprocessed = preprocessor.run(data)
data, discrete_features, continuous_features = preprocessed
data.head()

2026-01-22 16:24:20,549 - neural_blueprints.preprocess.tabular_preprocess - INFO - Identified 10 discrete features: ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
2026-01-22 16:24:20,549 - neural_blueprints.preprocess.tabular_preprocess - INFO - Identified 5 continuous features: ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
2026-01-22 16:24:20,550 - neural_blueprints.preprocess.tabular_preprocess - INFO - Discrete column 'workclass' has 2799/5.73% NaN values; these will be encoded as 0.
2026-01-22 16:24:20,607 - neural_blueprints.preprocess.tabular_preprocess - INFO - Discrete column 'occupation' has 2809/5.75% NaN values; these will be encoded as 0.
2026-01-22 16:24:20,661 - neural_blueprints.preprocess.tabular_preprocess - INFO - Discrete column 'native-country' has 857/1.75% NaN values; these will be encoded as 0.


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.109589,4,0.145129,2,14,5,7,4,3,2,0.0,0.0,0.397959,39,1
1,0.287671,4,0.052451,12,16,3,5,1,5,2,0.0,0.0,0.5,39,1
2,0.150685,2,0.219649,8,4,3,11,1,5,2,0.0,0.0,0.397959,39,2
3,0.369863,4,0.100153,16,2,3,7,1,3,2,0.076881,0.0,0.397959,39,2
4,0.013699,0,0.061708,16,2,5,0,4,5,1,0.0,0.0,0.295918,39,1


### Income Inference Accuracy

In [4]:
# Supervised dataset for single-label classification: 'income' is the label column.
label_columns = ["income"]
dataset = TabularLabelDataset(data=data, label_columns=label_columns)

# Random 90% / 10% train / validation split.
split_fractions = [0.9, 0.1]
train_dataset, val_dataset = dataset.random_split(split_fractions)

In [5]:
# Income classifier: tabular input projection -> MLP trunk -> tabular output projection.
n_features = len(dataset.cardinalities)
projection_dim = 16                     # per-feature projection width
trunk_dims = [256, 128, 128, 64, 64]    # hidden layer widths of the MLP trunk

# The input projection emits n_features * projection_dim values per row.
input_projection = TabularProjectionConfig(
    input_cardinalities=dataset.cardinalities,
    hidden_dims=[64, 32],
    projection_dim=projection_dim,
    output_dim=[n_features * projection_dim],
)

# The output head reads the last trunk width (64).
output_projection = TabularProjectionConfig(
    input_dim=[trunk_dims[-1]],
    hidden_dims=[32, 16],
    output_cardinalities=[2],
)

mlp_config = MLPConfig(
    input_spec=(n_features,),
    output_spec=(2,),
    hidden_dims=trunk_dims,
    normalization="layernorm",
    activation="gelu",
    dropout_p=0.2,
    final_activation=None,
    input_projection=input_projection,
    output_projection=output_projection,
)

# Build the model and display its layer summary for a batch of 256 rows.
model = MLP(mlp_config)
model.blueprint(batch_size=256, with_graph=False)

2026-01-22 16:24:20,733 - neural_blueprints.architectures.mlp - INFO - Using input projection: TabularInputProjection
2026-01-22 16:24:20,735 - neural_blueprints.architectures.mlp - INFO - Using output projection: TabularOutputProjection


Layer (type:depth-idx)                                                           Output Shape              Param #
MLP                                                                              [256, 3]                  --
├─Sequential: 1-3                                                                --                        (recursive)
│    └─TabularInputProjection: 2-1                                               [256, 224]                --
│    │    └─ModuleList: 3-1                                                      --                        86,560
├─Sequential: 1-2                                                                [256, 64]                 --
│    └─FeedForwardNetwork: 2-2                                                   [256, 64]                 --
│    │    └─Sequential: 3-2                                                      [256, 64]                 124,864
├─Sequential: 1-3                                                                --              

In [6]:
# Optimisation settings for the income classifier.
# NOTE(review): early_stopping_patience equals the number of epochs below, so early
# stopping presumably cannot trigger in this run — confirm against Trainer.
trainer_config = TrainerConfig(
    optimizer="adam",
    criterion="cross_entropy",
    learning_rate=1e-3,
    weight_decay=1e-5,
    batch_size=256,
    early_stopping_patience=5,
)
trainer = Trainer(config=trainer_config, model=model)

# Fit for five epochs, then run prediction on the validation split.
trainer.train(train_dataset, val_dataset, epochs=5)
trainer.predict(val_dataset)

2026-01-22 16:24:20,864 - neural_blueprints.utils.trainer - INFO - Trainer initialized on device: cpu
Training Epochs:  20%|██        | 1/5 [00:01<00:07,  1.96s/epoch]

Epoch 1/5, Training Loss: 0.8685, Validation Loss: 0.7831


Training Epochs:  40%|████      | 2/5 [00:03<00:05,  1.95s/epoch]

Epoch 2/5, Training Loss: 0.7507, Validation Loss: 0.7164


Training Epochs:  60%|██████    | 3/5 [00:05<00:03,  1.94s/epoch]

Epoch 3/5, Training Loss: 0.7207, Validation Loss: 0.7164


Training Epochs:  80%|████████  | 4/5 [00:07<00:01,  1.95s/epoch]

Epoch 4/5, Training Loss: 0.7096, Validation Loss: 0.6979


Training Epochs: 100%|██████████| 5/5 [00:09<00:00,  1.95s/epoch]
2026-01-22 16:24:30,626 - neural_blueprints.utils.trainer - INFO - Training completed in 9.76 seconds.
2026-01-22 16:24:30,626 - neural_blueprints.utils.trainer - INFO - Best validation loss: 6.8949e-01


Epoch 5/5, Training Loss: 0.7015, Validation Loss: 0.6895


2026-01-22 16:24:30,811 - neural_blueprints.utils.trainer - INFO - Inference completed in 0.09 seconds.


Classification Accuracy: 0.8434


0.8433981576253838

### Masked Dataset Inference Accuracy

In [7]:
# Masked dataset built from the same preprocessed frame.
# NOTE: this rebinds `dataset`, `train_dataset` and `val_dataset` from the
# classification section above.
# mask_prob is presumably the per-value masking probability — confirm against
# MaskedTabularDataset.
dataset = MaskedTabularDataset(data=data, mask_prob=0.35)

# Random 90% / 10% train / validation split.
split_fractions = [0.9, 0.1]
train_dataset, val_dataset = dataset.random_split(split_fractions)

In [8]:
# Masked-reconstruction model: one output per input feature.
n_features = len(dataset.cardinalities)
projection_dim = 32                 # per-feature projection width
trunk_dims = [512, 256, 256, 128]   # hidden layer widths of the MLP trunk

# The input projection emits n_features * projection_dim values per row.
input_projection = TabularProjectionConfig(
    input_cardinalities=dataset.cardinalities,
    projection_dim=projection_dim,
    output_dim=[n_features * projection_dim],
)

# The output head reads the last trunk width (128) and targets every feature's cardinality.
output_projection = TabularProjectionConfig(
    input_dim=[trunk_dims[-1]],
    hidden_dims=[64, 64],
    output_cardinalities=dataset.cardinalities,
)

mlp_config = MLPConfig(
    input_spec=(n_features,),
    output_spec=(n_features,),
    hidden_dims=trunk_dims,
    normalization="layernorm",
    activation="gelu",
    dropout_p=0.2,
    final_activation=None,
    input_projection=input_projection,
    output_projection=output_projection,
)

# Build the model and display its layer summary for a batch of 256 rows.
model = MLP(mlp_config)
model.blueprint(batch_size=256, with_graph=False)

2026-01-22 16:24:30,833 - neural_blueprints.architectures.mlp - INFO - Using input projection: TabularInputProjection
2026-01-22 16:24:30,837 - neural_blueprints.architectures.mlp - INFO - Using output projection: TabularOutputProjection


Layer (type:depth-idx)                                                           Output Shape              Param #
MLP                                                                              [256, 1]                  --
├─Sequential: 1-3                                                                --                        (recursive)
│    └─TabularInputProjection: 2-1                                               [256, 480]                --
│    │    └─ModuleList: 3-1                                                      --                        5,120
├─Sequential: 1-2                                                                [256, 128]                --
│    └─FeedForwardNetwork: 2-2                                                   [256, 128]                --
│    │    └─Sequential: 3-2                                                      [256, 128]                495,104
├─Sequential: 1-3                                                                --               

In [9]:
# Optimisation settings for masked reconstruction.
# The weights path is relative to the notebook's working directory.
# NOTE(review): early_stopping_patience equals the number of epochs below, so early
# stopping presumably cannot trigger in this run — confirm against Trainer.
trainer_config = TrainerConfig(
    optimizer="adam",
    criterion="masked_reconstruction",
    learning_rate=1e-3,
    weight_decay=1e-5,
    batch_size=256,
    early_stopping_patience=5,
    save_weights_path="./models/mlp_adult.pth",
)
trainer = Trainer(config=trainer_config, model=model)

# Fit for five epochs, then run prediction on the validation split.
trainer.train(train_dataset, val_dataset, epochs=5)
trainer.predict(val_dataset)

2026-01-22 16:24:30,903 - neural_blueprints.utils.trainer - INFO - Trainer initialized on device: cpu


Directory ./models already exists. Existing weights are overwritten.


Training Epochs:  20%|██        | 1/5 [00:02<00:10,  2.62s/epoch]

Epoch 1/5, Training Loss: 6.6206, Validation Loss: 6.3806


Training Epochs:  40%|████      | 2/5 [00:05<00:07,  2.62s/epoch]

Epoch 2/5, Training Loss: 6.3246, Validation Loss: 6.2250


Training Epochs:  60%|██████    | 3/5 [00:07<00:05,  2.64s/epoch]

Epoch 3/5, Training Loss: 6.2487, Validation Loss: 6.2068


Training Epochs:  80%|████████  | 4/5 [00:10<00:02,  2.63s/epoch]

Epoch 4/5, Training Loss: 6.2323, Validation Loss: 6.2013


Training Epochs: 100%|██████████| 5/5 [00:13<00:00,  2.63s/epoch]
2026-01-22 16:24:44,038 - neural_blueprints.utils.trainer - INFO - Training completed in 13.13 seconds.
2026-01-22 16:24:44,039 - neural_blueprints.utils.trainer - INFO - Best validation loss: 6.1779e+00


Epoch 5/5, Training Loss: 6.2212, Validation Loss: 6.1779


2026-01-22 16:24:44,129 - neural_blueprints.utils.trainer - INFO - Inference completed in 0.04 seconds.


Feature Column 0:
Predicted attribute values: [0.17823073 0.33286345 0.18611129 0.34994414 0.36021316]
True attribute values: [0.04109589 0.21917808 0.1780822  0.5479452  0.16438356]
Accuracy: 0.2233

Feature Column 1:
Predicted attribute values: [4 4 4 4 4]
True attribute values: [4. 7. 6. 1. 4.]
Accuracy: 0.6777

Feature Column 2:
Predicted attribute values: [0.11610021 0.1133433  0.116993   0.12255617 0.11404969]
True attribute values: [0.06532036 0.09595464 0.16982576 0.03029602 0.11409532]
Accuracy: 0.5727

Feature Column 3:
Predicted attribute values: [12 12 12 12 12]
True attribute values: [12. 12.  8. 10.  6.]
Accuracy: 0.3417

Feature Column 4:
Predicted attribute values: [16 16 16 16 16]
True attribute values: [16. 16.  2. 16.  3.]
Accuracy: 0.3432

Feature Column 5:
Predicted attribute values: [5 5 3 5 5]
True attribute values: [3. 3. 3. 1. 5.]
Accuracy: 0.6941

Feature Column 6:
Predicted attribute values: [ 8 10  8  4  4]
True attribute values: [ 3.  4. 12.  1. 14.]
Accura

{'avg_discrete_accuracy': np.float64(0.6018855218129346),
 'avg_continuous_accuracy': np.float64(0.6193122359141212),
 'overall_avg_accuracy': np.float64(0.6076944265133302)}