# 📑 Tutorials for PyPOTS Imputation Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from benchpots.datasets import preprocess_physionet2012

# Load the dataset with artificially missing values
physionet2012_dataset = preprocess_physionet2012(subset="all", rate=0.1)

2025-02-25 21:33:28 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2025-02-25 21:33:28 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2025-02-25 21:33:28 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2025-02-25 21:33:28 [INFO]: Loaded successfully!
2025-02-25 21:33:41 [INFO]: 69068 values masked out in the val set as ground truth, take 9.95% of the original observed values
2025-02-25 21:33:41 [INFO]: 86352 values masked out in the test set as ground truth, take 9.97% of the original observed values
2025-02-25 21:33:41 [INFO]: Total sample number: 11988
2025-02-25 21:33:41 [INFO]: Training set size: 7671 (63.99%)
2025-02-25 21:33:41 [INFO]: Validation set size: 1918 (16.00%)
2025-02-25 21:33:41 [INFO]: Test set size: 2399 (20.01%)
2025-02-25 21

In [2]:
# Assemble the datasets for training, validating, and testing.

dataset_for_training = {
    "X": physionet2012_dataset['train_X'],
}

dataset_for_validating = {
    "X": physionet2012_dataset['val_X'],
    "X_ori": physionet2012_dataset['val_X_ori'],
}

dataset_for_testing = {
    "X": physionet2012_dataset['test_X'],
}


In [3]:
dataset_for_training['X']

array([[[            nan,             nan,             nan, ...,
                     nan, -3.34186687e+00,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        ...,
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan]],

       [[            nan,             nan,             nan, ...,
                     nan, -2.82949840e-01,  2.50627088e-03],
        [            nan,             nan,             nan, ...,
         -4.98983937e-01,             

In [4]:
physionet2012_dataset['n_steps']

48

In [5]:
physionet2012_dataset['n_features']

37

## 🚀 An example of **SAITS** for imputation

In [10]:
from pypots.optim import Adam
from pypots.imputation import SAITS

# initialize the model
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=2,
    d_model=256,
    d_ffn=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    ##### Esto diferencia al SAIT del modelo Transformer
    diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
    ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
    # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/saits",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2025-02-25 21:40:45 [INFO]: No given device, using default device: cpu
2025-02-25 21:40:45 [INFO]: Model files will be saved to tutorial_results/imputation/saits/20250225_T214045
2025-02-25 21:40:45 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/saits/20250225_T214045/tensorboard
2025-02-25 21:40:45 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 1,378,358


In [11]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
saits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2025-02-25 21:41:49 [INFO]: Epoch 001 - training loss: 0.7339, validation loss: 0.4262
2025-02-25 21:42:39 [INFO]: Epoch 002 - training loss: 0.5421, validation loss: 0.4099
2025-02-25 21:43:31 [INFO]: Epoch 003 - training loss: 0.4953, validation loss: 0.3733
2025-02-25 21:44:07 [INFO]: Epoch 004 - training loss: 0.4601, validation loss: 0.3560
2025-02-25 21:44:42 [INFO]: Epoch 005 - training loss: 0.4384, validation loss: 0.3392
2025-02-25 21:45:18 [INFO]: Epoch 006 - training loss: 0.4151, validation loss: 0.3304
2025-02-25 21:45:55 [INFO]: Epoch 007 - training loss: 0.3985, validation loss: 0.3278
2025-02-25 21:46:30 [INFO]: Epoch 008 - training loss: 0.3877, validation loss: 0.3241
2025-02-25 21:47:06 [INFO]: Epoch 009 - training loss: 0.3812, validation loss: 0.3189
2025-02-25 21:47:42 [INFO]: Epoch 010 - training loss: 0.3724, validation loss: 0.3167
2025-02-25 21:47:42 [INFO]: Finished training. The best model is from epoch#10.
2025-02-25 21:47:42 [INFO]: Saved the model to tut

In [12]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set

saits_results = saits.predict(dataset_for_testing)
saits_imputation = saits_results["imputation"]


In [13]:
saits_imputation

array([[[-5.4385591e-01, -3.9818963e-01, -3.1251264e-01, ...,
         -3.4369278e-01, -1.0496655e+00,  2.6155731e-03],
        [ 2.2478100e-02, -2.2300261e-01, -1.4868762e-01, ...,
         -7.8569335e-01, -1.0496655e+00,  2.2831559e-03],
        [-5.4463887e-01, -3.4745437e-01, -2.9453677e-01, ...,
         -3.1145626e-01, -1.0496655e+00, -2.5334309e-03],
        ...,
        [-5.2853882e-01, -3.4087366e-01, -3.1559777e-01, ...,
         -3.2069087e-01, -1.0496655e+00,  2.4970931e-03],
        [-4.8634955e-01, -3.2567972e-01, -2.9759574e-01, ...,
         -2.9512572e-01, -1.0496655e+00,  2.4509071e-03],
        [-5.6467801e-01, -3.3407164e-01, -3.4903508e-01, ...,
         -3.0918378e-01, -1.0496655e+00,  2.5065220e-03]],

       [[-4.0907586e-01, -2.6457563e-01, -2.5503874e-01, ...,
          2.8050733e-01, -1.3758801e+00, -5.7525113e-03],
        [-4.0177315e-01, -4.5280036e-01, -3.8542396e-01, ...,
          3.6182575e-02, -1.3793929e+00, -1.8499468e-02],
        [-5.1438874e-01, 

In [15]:
import numpy as np
np.mean(saits_imputation, axis=0)

array([[-0.50989085, -0.36246145, -0.32518214, ..., -0.08309655,
        -0.31181836, -0.01096718],
       [-0.52665305, -0.32788512, -0.31562158, ..., -0.07947987,
        -0.09837142, -0.0109406 ],
       [-0.51331156, -0.3152645 , -0.3123883 , ..., -0.07330464,
        -0.09311794, -0.01073911],
       ...,
       [-0.43129858, -0.34363335, -0.30473357, ..., -0.20775025,
        -0.0294342 , -0.00497112],
       [-0.45912376, -0.33366537, -0.32008284, ..., -0.20705797,
        -0.02986197, -0.00422864],
       [-0.4829212 , -0.33076856, -0.3313236 , ..., -0.19852372,
        -0.03277353, -0.00243696]], shape=(48, 37), dtype=float32)

In [16]:
saits_imputation.shape

(2399, 48, 37)

In [17]:
from pypots.utils.metrics import calc_mae

physionet2012_dataset['test_X_indicating_mask'] = np.isnan(physionet2012_dataset['test_X']).astype(int)

# Crear una copia de los datos originales
test_X_ori_fixed = np.copy(physionet2012_dataset['test_X_ori'])

# Reemplazar NaNs por 0 (valor constante)
test_X_ori_fixed[np.isnan(test_X_ori_fixed)] = 0

# Verificar si quedan NaNs
num_nans_fixed = np.isnan(test_X_ori_fixed).sum()
print(f"Número de NaNs en test_X_ori después de imputación (con 0): {num_nans_fixed}")

# calculate mean absolute error on the ground truth (artificially-missing values)
# predictions, targets, masks
testing_mae = calc_mae(
    saits_imputation, 
    test_X_ori_fixed, 
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error (MAE): {testing_mae:.4f}")

Número de NaNs en test_X_ori después de imputación (con 0): 0
Testing mean absolute error (MAE): 0.3040
