# 📑 Tutorials for PyPOTS Imputation Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [1]:
from pypots.data.generating import gene_physionet2012
from pypots.utils.random import set_random_seed
from global_config import RANDOM_SEED

set_random_seed(RANDOM_SEED)

# Load the PhysioNet-2012 dataset
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0.1)

# Take a look at the generated PhysioNet-2012 dataset, you'll find that everything has been prepared for you,
# data splitting, normalization, additional artificially-missing values for evaluation, etc.
print(physionet2012_dataset.keys())


  @autocast(enabled=False)
  @autocast(enabled=False)
2025-02-24 17:38:11 [INFO]: Have set the random seed as 16 for numpy and pytorch.
2025-02-24 17:38:11 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2025-02-24 17:38:11 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2025-02-24 17:38:11 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2025-02-24 17:38:11 [INFO]: Loaded successfully!


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



2025-02-24 17:38:24 [INFO]: 69461 values masked out in the val set as ground truth, take 10.02% of the original observed values
2025-02-24 17:38:25 [INFO]: 86037 values masked out in the test set as ground truth, take 10.01% of the original observed values
2025-02-24 17:38:25 [INFO]: Total sample number: 11988
2025-02-24 17:38:25 [INFO]: Training set size: 7671 (63.99%)
2025-02-24 17:38:25 [INFO]: Validation set size: 1918 (16.00%)
2025-02-24 17:38:25 [INFO]: Test set size: 2399 (20.01%)
2025-02-24 17:38:25 [INFO]: Number of steps: 48
2025-02-24 17:38:25 [INFO]: Number of features: 37
2025-02-24 17:38:25 [INFO]: Train set missing rate: 79.69%
2025-02-24 17:38:25 [INFO]: Validating set missing rate: 81.70%
2025-02-24 17:38:25 [INFO]: Test set missing rate: 81.84%
2025-02-24 17:38:25 [INFO]: 🌟 Please refer to https://github.com/WenjieDu/BenchPOTS and check out the func benchpots.datasets.preprocess_physionet2012()


dict_keys(['n_classes', 'n_steps', 'n_features', 'scaler', 'train_X', 'train_y', 'train_ICUType', 'val_X', 'val_y', 'val_ICUType', 'test_X', 'test_y', 'test_ICUType', 'val_X_ori', 'test_X_ori'])


In [46]:
# Assemble the datasets for training, validating, and testing.

dataset_for_training = {
    "X": physionet2012_dataset['train_X'],
}

dataset_for_validating = {
    "X": physionet2012_dataset['val_X'],
    "X_ori": physionet2012_dataset['val_X_ori'],
}

dataset_for_testing = {
    "X": physionet2012_dataset['test_X'],
}


In [3]:
dataset_for_training['X']

array([[[            nan,             nan,             nan, ...,
                     nan, -3.31084528e+00,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        ...,
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan]],

       [[            nan,             nan,             nan, ...,
                     nan, -2.95065256e-01,  2.15766049e-03],
        [            nan,             nan,             nan, ...,
         -4.73794630e-01,             

In [4]:
physionet2012_dataset['n_steps']

48

In [5]:
physionet2012_dataset['n_features']

37

## 🚀 An example of **SAITS** for imputation

In [6]:
from pypots.optim import Adam
from pypots.imputation import SAITS

# initialize the model
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=2,
    d_model=256,
    d_ffn=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    ##### Esto diferencia al SAIT del modelo Transformer
    diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
    ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
    # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/saits",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2025-02-24 17:38:25 [INFO]: No given device, using default device: cpu
2025-02-24 17:38:25 [INFO]: Model files will be saved to tutorial_results/imputation/saits/20250224_T173825
2025-02-24 17:38:25 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/saits/20250224_T173825/tensorboard
2025-02-24 17:38:25 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 1,378,358


In [7]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
saits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2025-02-24 17:39:23 [INFO]: Epoch 001 - training loss: 0.7263, validation loss: 0.3443
2025-02-24 17:40:15 [INFO]: Epoch 002 - training loss: 0.5407, validation loss: 0.3227
2025-02-24 17:41:22 [INFO]: Epoch 003 - training loss: 0.4928, validation loss: 0.2973
2025-02-24 17:42:08 [INFO]: Epoch 004 - training loss: 0.4589, validation loss: 0.2726
2025-02-24 17:42:50 [INFO]: Epoch 005 - training loss: 0.4330, validation loss: 0.2637
2025-02-24 17:43:31 [INFO]: Epoch 006 - training loss: 0.4131, validation loss: 0.2581
2025-02-24 17:44:16 [INFO]: Epoch 007 - training loss: 0.3959, validation loss: 0.2482
2025-02-24 17:45:03 [INFO]: Epoch 008 - training loss: 0.3841, validation loss: 0.2512
2025-02-24 17:46:08 [INFO]: Epoch 009 - training loss: 0.3759, validation loss: 0.2401
2025-02-24 17:46:52 [INFO]: Epoch 010 - training loss: 0.3718, validation loss: 0.2395
2025-02-24 17:46:52 [INFO]: Finished training. The best model is from epoch#10.
2025-02-24 17:46:52 [INFO]: Saved the model to tut

In [17]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set

saits_results = saits.predict(dataset_for_testing)
saits_imputation = saits_results["imputation"]


In [19]:
saits_imputation

array([[[-3.39879394e-01, -4.37852383e-01, -4.21492040e-01, ...,
         -8.89166966e-02, -1.05096853e+00,  7.64137646e-03],
        [-1.47559956e-01, -2.59139180e-01, -1.85112804e-01, ...,
         -3.11014652e-01, -1.05096853e+00,  1.15424506e-02],
        [-4.09875810e-01, -4.41974819e-01, -4.40624833e-01, ...,
         -6.33399338e-02, -1.05096853e+00,  4.20941040e-03],
        ...,
        [-3.45102191e-01, -3.58970135e-01, -3.88035297e-01, ...,
         -1.18300311e-01, -1.05096853e+00,  3.33231990e-03],
        [-2.90943265e-01, -3.55989158e-01, -3.90741646e-01, ...,
         -1.04197644e-01, -1.05096853e+00,  5.07524936e-04],
        [-3.78139615e-01, -4.38871443e-01, -4.37648267e-01, ...,
         -1.44222736e-01, -1.05096853e+00,  1.11249043e-04]],

       [[-8.17321241e-02, -3.19774091e-01, -3.06260765e-01, ...,
         -1.22224152e-01,  4.17621247e-02,  3.61189805e-03],
        [-4.16784525e-01, -5.25243700e-01, -4.85108107e-01, ...,
         -8.97674412e-02,  4.17621247e

In [49]:
np.mean(saits_imputation, axis=0)

array([[-0.1621629 , -0.29536228, -0.27797275, ...,  0.01900954,
        -0.36396021, -0.00995649],
       [-0.16312107, -0.29983069, -0.28483136, ...,  0.01345677,
        -0.25902407, -0.00474422],
       [-0.16725748, -0.30267454, -0.28715961, ...,  0.01724678,
        -0.25595969, -0.00416867],
       ...,
       [-0.17454517, -0.30238643, -0.28782827, ...,  0.00449406,
        -0.12005425, -0.00915084],
       [-0.17096113, -0.30405467, -0.28838514, ...,  0.00160955,
        -0.12269905, -0.00932488],
       [-0.17276722, -0.29471161, -0.27505285, ...,  0.00812919,
        -0.13113304, -0.00932557]], shape=(48, 37))

In [18]:
saits_imputation.shape

(2399, 48, 37)

In [51]:
from pypots.utils.metrics import calc_mae

physionet2012_dataset['test_X_indicating_mask'] = np.isnan(physionet2012_dataset['test_X']).astype(int)

# Crear una copia de los datos originales
test_X_ori_fixed = np.copy(physionet2012_dataset['test_X_ori'])

# Reemplazar NaNs por 0 (valor constante)
test_X_ori_fixed[np.isnan(test_X_ori_fixed)] = 0

# Verificar si quedan NaNs
num_nans_fixed = np.isnan(test_X_ori_fixed).sum()
print(f"Número de NaNs en test_X_ori después de imputación (con 0): {num_nans_fixed}")

# calculate mean absolute error on the ground truth (artificially-missing values)
# predictions, targets, masks
testing_mae = calc_mae(
    saits_imputation, 
    test_X_ori_fixed, 
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error (MAE): {testing_mae:.4f}")

Número de NaNs en test_X_ori después de imputación (con 0): 0
Testing mean absolute error (MAE): 0.3018
