In [1]:
import torch
from sklearn.datasets import fetch_openml

from neural_blueprints.architectures import MLP
from neural_blueprints.config.architectures import MLPConfig
from neural_blueprints.config.utils import TrainerConfig
from neural_blueprints.config.components.composite.projections.input import TabularInputProjectionConfig
from neural_blueprints.config.components.composite.projections.output import TabularOutputProjectionConfig
from neural_blueprints.utils import Trainer, infer_types
from neural_blueprints.preprocess import TabularPreprocessor
from neural_blueprints.datasets import MaskedTabularDataset, TabularSingleLabelDataset

import logging
# Configure logging so that library log messages (e.g. from neural_blueprints)
# are printed with a timestamp, logger name and level.
logging.basicConfig(
    level=logging.DEBUG,  # most verbose level; use logging.INFO for less detail
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

In [2]:
# Fetch the OpenML "adult" dataset (version 2) as pandas objects.
# The returned Bunch gets its own name so that `data` only ever refers to the
# combined DataFrame used by the rest of the notebook.
adult_bunch = fetch_openml(name="adult", version=2, as_frame=True)
X = adult_bunch.data
y = adult_bunch.target

# Build one frame holding the features plus the target under the 'income'
# column. `assign` returns a new frame, so X itself is left unmodified.
data = X.assign(income=y)

# Cast the columns to the dtypes proposed by infer_types
# (presumably a column -> dtype mapping accepted by DataFrame.astype).
dtypes = infer_types(data)
data = data.astype(dtypes)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Run the tabular preprocessor. It yields the transformed frame together with
# the names of the discrete and the continuous feature columns (both lists are
# also reported in the log output of this cell).
# NOTE: `data` is overwritten here, so re-running only this cell would feed
# already-preprocessed data back in; re-run from the loading cell instead.
preprocessor = TabularPreprocessor()
preprocessing_result = preprocessor.run(data)
data, discrete_features, continuous_features = preprocessing_result
data.head()

2025-12-28 15:19:21,991 - neural_blueprints.preprocess.tabular_preprocess - INFO - Identified 10 discrete features: ['workclass', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']
2025-12-28 15:19:21,991 - neural_blueprints.preprocess.tabular_preprocess - INFO - Identified 5 continuous features: ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.109589,4,0.145129,2,14,5,7,4,3,2,0.0,0.0,0.397959,39,1
1,0.287671,4,0.052451,12,16,3,5,1,5,2,0.0,0.0,0.5,39,1
2,0.150685,2,0.219649,8,4,3,11,1,5,2,0.0,0.0,0.397959,39,2
3,0.369863,4,0.100153,16,2,3,7,1,3,2,0.076881,0.0,0.397959,39,2
4,0.013699,0,0.061708,16,2,5,0,4,5,1,0.0,0.0,0.295918,39,1


### Income Inference Accuracy

In [4]:
# Supervised dataset for income prediction: the label is read from the
# 'income' column of the preprocessed frame.
dataset = TabularSingleLabelDataset(
    data=data,
    label_column='income',              # Specify the label column for single-label classification
    discrete_features=discrete_features,
    continuous_features=continuous_features
)

# 90% / 10% train/validation split. A seeded generator is passed so the split
# (and therefore the reported validation metrics) does not change between runs.
SPLIT_SEED = 42
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [5]:
# Width of the input projection's output and of the output projection's input:
# 16 units per entry in dataset.cardinalities.
projection_dim = len(dataset.cardinalities) * 16

# Input side: projects the tabular features to `projection_dim`.
input_projection_config = TabularInputProjectionConfig(
    cardinalities=dataset.cardinalities,
    hidden_dims=[64, 32],
    output_dim=[projection_dim],
    normalization="batchnorm1d",
    activation="gelu",
    dropout_p=0.2
)

# Output side: a single output with cardinality 2 (the income label).
output_projection_config = TabularOutputProjectionConfig(
    input_cardinalities=dataset.cardinalities,
    output_cardinalities=[2],
    input_dim=[projection_dim],
    hidden_dims=[len(dataset.cardinalities) * 8],
    normalization="batchnorm1d",
    activation="gelu",
    dropout_p=0.2
)

# MLP trunk placed between the two projections.
mlp_config = MLPConfig(
    hidden_dims=[128, 64, 32, 16],
    normalization="batchnorm1d",
    activation='gelu',
    dropout_p=0.2,
    final_activation=None,
    input_projection=input_projection_config,
    output_projection=output_projection_config
)

# Instantiate the network and print its layer structure / resolved config.
model = MLP(mlp_config)
model.blueprint()

2025-12-28 15:19:22,174 - neural_blueprints.architectures.mlp - INFO - Using input projection: TabularInputProjection
2025-12-28 15:19:22,175 - neural_blueprints.architectures.mlp - INFO - Using output projection: TabularOutputProjection


Sequential(
  (0): TabularInputProjection(
    (input_projections): ModuleList(
      (0): FeedForwardNetwork(
        (network): Sequential(
          (0): DenseLayer(
            (layer): Sequential(
              (0): Linear(in_features=1, out_features=64, bias=True)
              (1): NormalizationLayer(
                (network): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (2): GELU(approximate='none')
              (3): DropoutLayer(
                (dropout): Dropout(p=0.2, inplace=False)
              )
            )
          )
          (1): DenseLayer(
            (layer): Sequential(
              (0): Linear(in_features=64, out_features=32, bias=True)
              (1): NormalizationLayer(
                (network): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (2): GELU(approximate='none')
              (3): DropoutLayer(
                (dropo

MLPConfig(input_dim=224, hidden_dims=[128, 64, 32, 16], output_dim=224, normalization='batchnorm1d', activation='gelu', dropout_p=0.2, final_activation=None, input_projection=TabularInputProjectionConfig(cardinalities=[1, 9, 1, 16, 16, 7, 15, 6, 5, 2, 1, 1, 1, 42], hidden_dims=[64, 32], output_dim=[224], dropout_p=0.2, normalization='batchnorm1d', activation='gelu'), output_projection=TabularOutputProjectionConfig(input_cardinalities=[1, 9, 1, 16, 16, 7, 15, 6, 5, 2, 1, 1, 1, 42], output_cardinalities=[2], input_dim=[224], hidden_dims=[112], activation='gelu', normalization='batchnorm1d', dropout_p=0.2))

In [6]:
# Optimisation settings for the income classifier.
trainer_config = TrainerConfig(
    optimizer="adam",
    criterion="cross_entropy",
    learning_rate=1e-3,
    weight_decay=1e-5,
    batch_size=128,
    early_stopping_patience=5,
    save_weights_path="./models/mlp_adult.pth"
)
trainer = Trainer(config=trainer_config, model=model)

# Fit on the training split, validating on the held-out split after each epoch.
trainer.train(train_dataset, val_dataset, epochs=5)

2025-12-28 15:19:22,528 - neural_blueprints.utils.trainer - INFO - Trainer initialized on device: cpu


Directory ./models already exists. Existing weights are overwritten.


Training Epochs:  20%|██        | 1/5 [00:02<00:10,  2.53s/epoch]

Epoch 1/5, Training Loss: 0.7370, Validation Loss: 0.6893


Training Epochs:  40%|████      | 2/5 [00:05<00:07,  2.58s/epoch]

Epoch 2/5, Training Loss: 0.6880, Validation Loss: 0.6855


Training Epochs:  60%|██████    | 3/5 [00:07<00:05,  2.55s/epoch]

Epoch 3/5, Training Loss: 0.6838, Validation Loss: 0.6877


Training Epochs:  80%|████████  | 4/5 [00:10<00:02,  2.58s/epoch]

Epoch 4/5, Training Loss: 0.6826, Validation Loss: 0.6816


Training Epochs: 100%|██████████| 5/5 [00:12<00:00,  2.56s/epoch]
2025-12-28 15:19:35,326 - neural_blueprints.utils.trainer - INFO - Training completed in 12.80 seconds.
2025-12-28 15:19:35,327 - neural_blueprints.utils.trainer - INFO - Best validation loss: 6.7960e-01


Epoch 5/5, Training Loss: 0.6813, Validation Loss: 0.6796


In [7]:
# Evaluate the trained classifier on the validation split; the value returned
# by predict() is kept in a variable and echoed as the cell result.
val_accuracy = trainer.predict(val_dataset)
val_accuracy

2025-12-28 15:19:35,590 - neural_blueprints.utils.trainer - INFO - Inference completed in 0.04 seconds.


Predictions: tensor([1, 1, 1, 1, 1]), 
 Ground Truth: tensor([1, 1, 1, 1, 1])
Prediction Accuracy: 0.8479


np.float64(0.8479017400204708)

### Masked Dataset Inference Accuracy

In [8]:
# Dataset for the masked-reconstruction task, built from the same preprocessed
# frame with mask_prob=0.35.
# NOTE: this rebinds `dataset`, `train_dataset` and `val_dataset` from the
# classification section above.
dataset = MaskedTabularDataset(
    data = data,
    discrete_features = discrete_features,
    continuous_features = continuous_features,
    mask_prob=0.35
)

# 90% / 10% train/validation split. A seeded generator is passed so the split
# (and therefore the reported validation metrics) does not change between runs.
SPLIT_SEED = 42
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [9]:
# Width of the input projection's output and of the output projection's input:
# 16 units per entry in dataset.cardinalities.
projection_dim = len(dataset.cardinalities) * 16

# Input side: projects the tabular features to `projection_dim`.
input_projection_config = TabularInputProjectionConfig(
    cardinalities=dataset.cardinalities,
    hidden_dims=[64, 32],
    output_dim=[projection_dim],
    normalization="batchnorm1d",
    activation="gelu",
    dropout_p=0.2
)

# Output side: no output_cardinalities are given here, unlike the classifier.
output_projection_config = TabularOutputProjectionConfig(
    input_cardinalities=dataset.cardinalities,
    input_dim=[projection_dim],
    hidden_dims=[8],
    activation="gelu",
    normalization="batchnorm1d",
    dropout_p=0.2
)

# MLP trunk placed between the two projections.
mlp_config = MLPConfig(
    hidden_dims=[128, 64, 32, 16],
    normalization="batchnorm1d",
    activation='gelu',
    dropout_p=0.2,
    final_activation="relu",
    input_projection=input_projection_config,
    output_projection=output_projection_config
)

# Instantiate the network and print its layer structure / resolved config.
model = MLP(mlp_config)
model.blueprint()

2025-12-28 15:19:35,616 - neural_blueprints.architectures.mlp - INFO - Using input projection: TabularInputProjection


2025-12-28 15:19:35,623 - neural_blueprints.architectures.mlp - INFO - Using output projection: TabularOutputProjection


Sequential(
  (0): TabularInputProjection(
    (input_projections): ModuleList(
      (0): FeedForwardNetwork(
        (network): Sequential(
          (0): DenseLayer(
            (layer): Sequential(
              (0): Linear(in_features=1, out_features=64, bias=True)
              (1): NormalizationLayer(
                (network): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (2): GELU(approximate='none')
              (3): DropoutLayer(
                (dropout): Dropout(p=0.2, inplace=False)
              )
            )
          )
          (1): DenseLayer(
            (layer): Sequential(
              (0): Linear(in_features=64, out_features=32, bias=True)
              (1): NormalizationLayer(
                (network): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              )
              (2): GELU(approximate='none')
              (3): DropoutLayer(
                (dropo

MLPConfig(input_dim=240, hidden_dims=[128, 64, 32, 16], output_dim=240, normalization='batchnorm1d', activation='gelu', dropout_p=0.2, final_activation='relu', input_projection=TabularInputProjectionConfig(cardinalities=[1, 9, 1, 16, 16, 7, 15, 6, 5, 2, 1, 1, 1, 42, 2], hidden_dims=[64, 32], output_dim=[240], dropout_p=0.2, normalization='batchnorm1d', activation='gelu'), output_projection=TabularOutputProjectionConfig(input_cardinalities=[1, 9, 1, 16, 16, 7, 15, 6, 5, 2, 1, 1, 1, 42, 2], output_cardinalities=None, input_dim=[240], hidden_dims=[8], activation='gelu', normalization='batchnorm1d', dropout_p=0.2))

In [10]:
# Trainer for the masked-reconstruction model.
# The weights go to their own file: the income classifier trained earlier in
# this notebook saves to "./models/mlp_adult.pth", and reusing that path here
# would overwrite the classifier's weights with this model's.
trainer = Trainer(
    config=TrainerConfig(
        optimizer="adam",
        criterion="masked_reconstruction",
        learning_rate=1e-3,
        weight_decay=1e-5,
        batch_size=128,
        early_stopping_patience=5,
        save_weights_path="./models/mlp_adult_masked.pth"
    ),
    model= model
)

# Train the model
trainer.train(train_dataset, val_dataset, epochs=5)

2025-12-28 15:19:35,639 - neural_blueprints.utils.trainer - INFO - Trainer initialized on device: cpu


Directory ./models already exists. Existing weights are overwritten.


Training Epochs:  20%|██        | 1/5 [00:03<00:15,  3.87s/epoch]

Epoch 1/5, Training Loss: 7.3220, Validation Loss: 6.5526


Training Epochs:  40%|████      | 2/5 [00:07<00:11,  3.83s/epoch]

Epoch 2/5, Training Loss: 6.4846, Validation Loss: 6.2753


Training Epochs:  60%|██████    | 3/5 [00:11<00:07,  3.89s/epoch]

Epoch 3/5, Training Loss: 6.3044, Validation Loss: 6.1875


Training Epochs:  80%|████████  | 4/5 [00:15<00:03,  3.95s/epoch]

Epoch 4/5, Training Loss: 6.2313, Validation Loss: 6.1493


Training Epochs: 100%|██████████| 5/5 [00:19<00:00,  3.91s/epoch]
2025-12-28 15:19:55,184 - neural_blueprints.utils.trainer - INFO - Training completed in 19.54 seconds.
2025-12-28 15:19:55,185 - neural_blueprints.utils.trainer - INFO - Best validation loss: 6.1350e+00


Epoch 5/5, Training Loss: 6.1969, Validation Loss: 6.1350


In [11]:
# Evaluate masked-feature reconstruction on the validation split; the value
# returned by predict() is kept in a variable and echoed as the cell result.
reconstruction_metrics = trainer.predict(val_dataset)
reconstruction_metrics

2025-12-28 15:19:55,299 - neural_blueprints.utils.trainer - INFO - Inference completed in 0.03 seconds.


Feature Column 0:
Predicted attribute values: [0.37699813 0.2375072  0.24207658 0.25268227 0.24441426]
True attribute values: [0.20547946 0.28767124 0.10958904 0.30136988 0.06849315]
Accuracy: 0.1948

Feature Column 1:
Predicted attribute values: [4 4 4 4 4]
True attribute values: [4. 4. 6. 4. 4.]
Accuracy: 0.6880

Feature Column 2:
Predicted attribute values: [0.10805448 0.11114298 0.1089888  0.10163829 0.10179372]
True attribute values: [0.28141654 0.01484999 0.12027954 0.08305917 0.08640464]
Accuracy: 0.5850

Feature Column 3:
Predicted attribute values: [12 12 12 12 13]
True attribute values: [12. 16. 16. 12. 13.]
Accuracy: 0.3676

Feature Column 4:
Predicted attribute values: [16 16 16 16 16]
True attribute values: [2. 2. 2. 2. 2.]
Accuracy: 0.3200

Feature Column 5:
Predicted attribute values: [5 5 5 5 3]
True attribute values: [7. 5. 5. 5. 3.]
Accuracy: 0.7024

Feature Column 6:
Predicted attribute values: [ 1 10  8  1  1]
True attribute values: [ 1.  3. 10. 10. 12.]
Accuracy: 0

{'avg_discrete_accuracy': np.float64(0.6064907879899555),
 'avg_continuous_accuracy': np.float64(0.6512516249552436),
 'overall_avg_accuracy': np.float64(0.6214110669783849)}