# 📑 Tutorials for PyPOTS Imputation Models

## 📀 Preparing the **PhysioNet-2012** dataset for this tutorial

In [2]:
from pypots.data.generating import gene_physionet2012
from pypots.utils.random import set_random_seed
from global_config import RANDOM_SEED

set_random_seed(RANDOM_SEED)

# Load the PhysioNet-2012 dataset
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0.1)

# Take a look at the generated PhysioNet-2012 dataset, you'll find that everything has been prepared for you,
# data splitting, normalization, additional artificially-missing values for evaluation, etc.
print(physionet2012_dataset.keys())


2025-02-24 17:18:22 [INFO]: Have set the random seed as 16 for numpy and pytorch.
2025-02-24 17:18:22 [INFO]: You're using dataset physionet_2012, please cite it properly in your work. You can find its reference information at the below link: 
https://github.com/WenjieDu/TSDB/tree/main/dataset_profiles/physionet_2012
2025-02-24 17:18:22 [INFO]: Dataset physionet_2012 has already been downloaded. Processing directly...
2025-02-24 17:18:22 [INFO]: Dataset physionet_2012 has already been cached. Loading from cache directly...
2025-02-24 17:18:23 [INFO]: Loaded successfully!
2025-02-24 17:18:36 [INFO]: 69461 values masked out in the val set as ground truth, take 10.02% of the original observed values
2025-02-24 17:18:36 [INFO]: 86037 values masked out in the test set as ground truth, take 10.01% of the original observed values
2025-02-24 17:18:36 [INFO]: Total sample number: 11988
2025-02-24 17:18:36 [INFO]: Training set size: 7671 (63.99%)
2025-02-24 17:18:36 [INFO]: Validation set size: 

dict_keys(['n_classes', 'n_steps', 'n_features', 'scaler', 'train_X', 'train_y', 'train_ICUType', 'val_X', 'val_y', 'val_ICUType', 'test_X', 'test_y', 'test_ICUType', 'val_X_ori', 'test_X_ori'])


In [3]:
# Assemble the datasets for training, validating, and testing.

dataset_for_training = {
    "X": physionet2012_dataset['train_X'],
}

dataset_for_validating = {
    "X": physionet2012_dataset['val_X'],
    "X_ori": physionet2012_dataset['val_X_ori'],
}

dataset_for_testing = {
    "X": physionet2012_dataset['test_X'],
}


In [4]:
dataset_for_training['X']

array([[[            nan,             nan,             nan, ...,
                     nan, -3.31084528e+00,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        ...,
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan],
        [            nan,             nan,             nan, ...,
                     nan,             nan,             nan]],

       [[            nan,             nan,             nan, ...,
                     nan, -2.95065256e-01,  2.15766049e-03],
        [            nan,             nan,             nan, ...,
         -4.73794630e-01,             

In [5]:
physionet2012_dataset['n_steps']

48

In [6]:
physionet2012_dataset['n_features']

37

## 🚀 An example of **SAITS** for imputation

In [7]:
from pypots.optim import Adam
from pypots.imputation import SAITS

# initialize the model
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=2,
    d_model=256,
    d_ffn=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    ##### Esto diferencia al SAIT del modelo Transformer
    diagonal_attention_mask=True,  # otherwise the original self-attention mechanism will be applied
    ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
    # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/saits",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2025-02-24 17:18:37 [INFO]: No given device, using default device: cpu
2025-02-24 17:18:37 [INFO]: Model files will be saved to tutorial_results/imputation/saits/20250224_T171837
2025-02-24 17:18:37 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/saits/20250224_T171837/tensorboard
2025-02-24 17:18:37 [INFO]: SAITS initialized with the given hyperparameters, the number of trainable parameters: 1,378,358


In [None]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
saits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2025-02-24 17:27:38 [INFO]: Epoch 001 - training loss: 0.3648, validation loss: 0.2378
2025-02-24 17:28:18 [INFO]: Epoch 002 - training loss: 0.3593, validation loss: 0.2386
2025-02-24 17:29:03 [INFO]: Epoch 003 - training loss: 0.3569, validation loss: 0.2390


In [None]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set

# imputación en el conjunto de entrenamiento!

saits_results = saits.predict(dataset_for_training)
saits_imputation = saits_results["imputation"]


In [None]:
saits_imputation

In [None]:
saits_imputation.shape

In [1]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    saits_imputation, 
    physionet2012_dataset['test_X_ori'], 
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


  @autocast(enabled=False)
  @autocast(enabled=False)


[34m
████████╗██╗███╗   ███╗███████╗    ███████╗███████╗██████╗ ██╗███████╗███████╗    █████╗ ██╗
╚══██╔══╝██║████╗ ████║██╔════╝    ██╔════╝██╔════╝██╔══██╗██║██╔════╝██╔════╝   ██╔══██╗██║
   ██║   ██║██╔████╔██║█████╗█████╗███████╗█████╗  ██████╔╝██║█████╗  ███████╗   ███████║██║
   ██║   ██║██║╚██╔╝██║██╔══╝╚════╝╚════██║██╔══╝  ██╔══██╗██║██╔══╝  ╚════██║   ██╔══██║██║
   ██║   ██║██║ ╚═╝ ██║███████╗    ███████║███████╗██║  ██║██║███████╗███████║██╗██║  ██║██║
   ╚═╝   ╚═╝╚═╝     ╚═╝╚══════╝    ╚══════╝╚══════╝╚═╝  ╚═╝╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═╝╚═╝
ai4ts v0.0.3 - building AI for unified time-series analysis, https://time-series.ai [0m



NameError: name 'saits_imputation' is not defined

## 🚀 An example of **Transformer** for imputation

In [8]:
from pypots.optim import Adam
from pypots.imputation import Transformer

# initialize the model
transformer = Transformer(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=6,
    d_model=512,
    d_ffn=256,
    n_heads=4,
    d_k=128,
    d_v=128,
    dropout=0.1,
    attn_dropout=0,
    ORT_weight=1,  # you can adjust the weight values of arguments ORT_weight
    # and MIT_weight to make the SAITS model focus more on one task. Usually you can just leave them to the default values, i.e. 1.
    MIT_weight=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,  
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/transformer",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2024-05-21 12:33:33 [INFO]: Using the given device: cuda:0
2024-05-21 12:33:33 [INFO]: Model files will be saved to tutorial_results/imputation/transformer/20240521_T123333
2024-05-21 12:33:33 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/transformer/20240521_T123333/tensorboard
2024-05-21 12:33:33 [INFO]: Transformer initialized with the given hyperparameters, the number of trainable parameters: 7,938,597


In [9]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
transformer.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 12:33:41 [INFO]: Epoch 001 - training loss: 1.4364, validation loss: 1.0100
2024-05-21 12:33:48 [INFO]: Epoch 002 - training loss: 1.3903, validation loss: 0.9911
2024-05-21 12:33:55 [INFO]: Epoch 003 - training loss: 1.3806, validation loss: 0.9910
2024-05-21 12:34:03 [INFO]: Epoch 004 - training loss: 1.3697, validation loss: 0.9919
2024-05-21 12:34:10 [INFO]: Epoch 005 - training loss: 1.3679, validation loss: 0.9864
2024-05-21 12:34:18 [INFO]: Epoch 006 - training loss: 1.3655, validation loss: 0.9801
2024-05-21 12:34:25 [INFO]: Epoch 007 - training loss: 1.3645, validation loss: 0.9849
2024-05-21 12:34:32 [INFO]: Epoch 008 - training loss: 1.3643, validation loss: 0.9886
2024-05-21 12:34:40 [INFO]: Epoch 009 - training loss: 1.3642, validation loss: 0.9945
2024-05-21 12:34:40 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-05-21 12:34:40 [INFO]: Finished training. The best model is from epoch#6.
2024-05-21 12:34:40 [INFO]: Saved the mo

In [10]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set
transformer_results = transformer.predict(dataset_for_testing)
transformer_imputation = transformer_results["imputation"]

NameError: name 'transformer' is not defined

In [11]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    transformer_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.6862


## 🚀 An example of **TimesNet** for imputation

In [12]:
from pypots.optim import Adam
from pypots.imputation import TimesNet

# initialize the model
timesnet = TimesNet(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=1,
    top_k=1,
    d_model=128,
    d_ffn=512,
    n_kernels=5,
    dropout=0.5,
    apply_nonstationary_norm=False,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/timesnet",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)

2024-05-21 12:34:40 [INFO]: Using the given device: cuda:0
2024-05-21 12:34:40 [INFO]: Model files will be saved to tutorial_results/imputation/timesnet/20240521_T123440
2024-05-21 12:34:40 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/timesnet/20240521_T123440/tensorboard
2024-05-21 12:34:41 [INFO]: TimesNet initialized with the given hyperparameters, the number of trainable parameters: 21,649,573


In [13]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
timesnet.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 12:34:48 [INFO]: Epoch 001 - training loss: 0.4796, validation loss: 0.3719
2024-05-21 12:34:52 [INFO]: Epoch 002 - training loss: 0.4039, validation loss: 0.3458
2024-05-21 12:34:55 [INFO]: Epoch 003 - training loss: 0.4087, validation loss: 0.3382
2024-05-21 12:34:59 [INFO]: Epoch 004 - training loss: 0.4472, validation loss: 0.3351
2024-05-21 12:35:02 [INFO]: Epoch 005 - training loss: 0.4144, validation loss: 0.3300
2024-05-21 12:35:06 [INFO]: Epoch 006 - training loss: 0.4142, validation loss: 0.3274
2024-05-21 12:35:09 [INFO]: Epoch 007 - training loss: 0.4203, validation loss: 0.3257
2024-05-21 12:35:13 [INFO]: Epoch 008 - training loss: 0.4631, validation loss: 0.3249
2024-05-21 12:35:16 [INFO]: Epoch 009 - training loss: 0.3840, validation loss: 0.3172
2024-05-21 12:35:20 [INFO]: Epoch 010 - training loss: 0.3685, validation loss: 0.3151
2024-05-21 12:35:20 [INFO]: Finished training. The best model is from epoch#10.
2024-05-21 12:35:20 [INFO]: Saved the model to tut

In [14]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set
timesnet_results = timesnet.predict(dataset_for_testing)
timesnet_imputation = timesnet_results["imputation"]

In [15]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    timesnet_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.3243


## 🚀 An example of **CSDI** for imputation

In [16]:
from pypots.optim import Adam
from pypots.imputation import CSDI

# initialize the model
csdi = CSDI(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=6,
    n_heads=2,
    n_channels=128,
    d_time_embedding=64,
    d_feature_embedding=32,
    d_diffusion_embedding=128,
    target_strategy="random",
    n_diffusion_steps=50,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/csdi",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)

2024-05-21 12:35:21 [INFO]: Using the given device: cuda:0
2024-05-21 12:35:21 [INFO]: Model files will be saved to tutorial_results/imputation/csdi/20240521_T123521
2024-05-21 12:35:21 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/csdi/20240521_T123521/tensorboard
2024-05-21 12:35:21 [INFO]: CSDI initialized with the given hyperparameters, the number of trainable parameters: 1,694,753


In [17]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
csdi.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 12:37:43 [INFO]: Epoch 001 - training loss: 0.3379, validation loss: 0.2506
2024-05-21 12:40:08 [INFO]: Epoch 002 - training loss: 0.2575, validation loss: 0.2083
2024-05-21 12:42:31 [INFO]: Epoch 003 - training loss: 0.2406, validation loss: 0.1998
2024-05-21 12:44:56 [INFO]: Epoch 004 - training loss: 0.2389, validation loss: 0.1978
2024-05-21 12:47:20 [INFO]: Epoch 005 - training loss: 0.2343, validation loss: 0.1984
2024-05-21 12:49:44 [INFO]: Epoch 006 - training loss: 0.2272, validation loss: 0.1907
2024-05-21 12:52:07 [INFO]: Epoch 007 - training loss: 0.2298, validation loss: 0.1924
2024-05-21 12:54:32 [INFO]: Epoch 008 - training loss: 0.2208, validation loss: 0.1882
2024-05-21 12:56:56 [INFO]: Epoch 009 - training loss: 0.2267, validation loss: 0.1827
2024-05-21 12:59:20 [INFO]: Epoch 010 - training loss: 0.2188, validation loss: 0.1845
2024-05-21 12:59:20 [INFO]: Finished training. The best model is from epoch#9.
2024-05-21 12:59:20 [INFO]: Saved the model to tuto

In [18]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set

# CSDI has an argument to control the number of sampling times during inference
csdi_results = csdi.predict(dataset_for_testing, n_sampling_times=2)
csdi_imputation = csdi_results["imputation"]

print(f"The shape of csdi_imputation is {csdi_imputation.shape}")

# for error calculation, we need to take the mean value of the multiple samplings for each data sample
mean_csdi_imputation = csdi_imputation.mean(axis=1)

The shape of csdi_imputation is (2398, 2, 48, 37)


In [19]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    mean_csdi_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2781


## 🚀 An example of **US-GAN** for imputation

In [20]:
from pypots.optim import Adam
from pypots.imputation import USGAN

# initialize the model
us_gan = USGAN(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=256,
    lambda_mse=1,
    dropout=0.1,
    G_steps=1,
    D_steps=1,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    G_optimizer=Adam(lr=1e-3),
    D_optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/us_gan",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2024-05-21 13:04:08 [INFO]: Using the given device: cuda:0
2024-05-21 13:04:08 [INFO]: Model files will be saved to tutorial_results/imputation/us_gan/20240521_T130408
2024-05-21 13:04:08 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/us_gan/20240521_T130408/tensorboard
2024-05-21 13:04:08 [INFO]: USGAN initialized with the given hyperparameters, the number of trainable parameters: 1,258,517


In [21]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
us_gan.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 13:06:21 [INFO]: Epoch 001 - generator training loss: 0.4250, discriminator training loss: 0.1838, validation loss: 0.3584
2024-05-21 13:08:16 [INFO]: Epoch 002 - generator training loss: 0.3517, discriminator training loss: 0.0526, validation loss: 0.3129
2024-05-21 13:10:10 [INFO]: Epoch 003 - generator training loss: 0.3263, discriminator training loss: 0.0367, validation loss: 0.2942
2024-05-21 13:12:00 [INFO]: Epoch 004 - generator training loss: 0.3103, discriminator training loss: 0.0310, validation loss: 0.2862
2024-05-21 13:13:53 [INFO]: Epoch 005 - generator training loss: 0.3010, discriminator training loss: 0.0281, validation loss: 0.2839
2024-05-21 13:15:47 [INFO]: Epoch 006 - generator training loss: 0.2950, discriminator training loss: 0.0281, validation loss: 0.2732
2024-05-21 13:17:40 [INFO]: Epoch 007 - generator training loss: 0.2836, discriminator training loss: 0.0258, validation loss: 0.2791
2024-05-21 13:19:37 [INFO]: Epoch 008 - generator training los

In [22]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set
us_gan_results = us_gan.predict(dataset_for_testing)
us_gan_imputation = us_gan_results["imputation"]

In [23]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    us_gan_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2805


## 🚀 An example of **GP-VAE** for imputation

In [24]:
from pypots.optim import Adam
from pypots.imputation import GPVAE

# initialize the model
gp_vae = GPVAE(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    latent_size=37,
    encoder_sizes=(128,128),
    decoder_sizes=(256,256),
    kernel="cauchy",
    beta=0.2,
    M=1,
    K=1,
    sigma=1.005,
    length_scale=7.0,
    kernel_scales=1,
    window_size=24,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/gp_vae",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2024-05-21 13:23:43 [INFO]: Using the given device: cuda:0
2024-05-21 13:23:43 [INFO]: Model files will be saved to tutorial_results/imputation/gp_vae/20240521_T132343
2024-05-21 13:23:43 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/gp_vae/20240521_T132343/tensorboard
2024-05-21 13:23:43 [INFO]: GPVAE initialized with the given hyperparameters, the number of trainable parameters: 229,652


In [25]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
gp_vae.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 13:23:47 [INFO]: Epoch 001 - training loss: 26022.2357, validation loss: 0.6172
2024-05-21 13:23:51 [INFO]: Epoch 002 - training loss: 22874.5681, validation loss: 0.5858
2024-05-21 13:23:55 [INFO]: Epoch 003 - training loss: 22840.0816, validation loss: 0.5715
2024-05-21 13:23:59 [INFO]: Epoch 004 - training loss: 22828.3269, validation loss: 0.5638
2024-05-21 13:24:03 [INFO]: Epoch 005 - training loss: 22821.9063, validation loss: 0.5509
2024-05-21 13:24:06 [INFO]: Epoch 006 - training loss: 22817.8396, validation loss: 0.5411
2024-05-21 13:24:10 [INFO]: Epoch 007 - training loss: 22816.0375, validation loss: 0.5412
2024-05-21 13:24:15 [INFO]: Epoch 008 - training loss: 22813.3457, validation loss: 0.5447
2024-05-21 13:24:19 [INFO]: Epoch 009 - training loss: 22812.1050, validation loss: 0.5212
2024-05-21 13:24:23 [INFO]: Epoch 010 - training loss: 22809.5779, validation loss: 0.5380
2024-05-21 13:24:23 [INFO]: Finished training. The best model is from epoch#9.
2024-05-21 

In [26]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set

# GP-VAE has an argument to control the number of sampling times during inference
gp_vae_results = gp_vae.predict(dataset_for_testing, n_sampling_times=2)
gp_vae_imputation = gp_vae_results["imputation"]

print(f"The shape of gp_vae_imputation is {gp_vae_imputation.shape}")

# for error calculation, we need to take the mean value of the multiple samplings for each data sample
mean_gp_vae_imputation = gp_vae_imputation.mean(axis=1)


The shape of gp_vae_imputation is (2398, 2, 48, 37)


In [27]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    mean_gp_vae_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.4822


## 🚀 An example of **BRITS** for imputation

In [28]:
from pypots.optim import Adam
from pypots.imputation import BRITS

# initialize the model
brits = BRITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files 
    saving_path="tutorial_results/imputation/brits", 
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2024-05-21 13:24:24 [INFO]: Using the given device: cuda:0
2024-05-21 13:24:24 [INFO]: Model files will be saved to tutorial_results/imputation/brits/20240521_T132424
2024-05-21 13:24:24 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/brits/20240521_T132424/tensorboard
2024-05-21 13:24:24 [INFO]: BRITS initialized with the given hyperparameters, the number of trainable parameters: 239,344


In [29]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
brits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 13:26:06 [INFO]: Epoch 001 - training loss: 0.9522, validation loss: 0.3980
2024-05-21 13:27:26 [INFO]: Epoch 002 - training loss: 0.7375, validation loss: 0.3423
2024-05-21 13:28:49 [INFO]: Epoch 003 - training loss: 0.6834, validation loss: 0.3278
2024-05-21 13:30:15 [INFO]: Epoch 004 - training loss: 0.6578, validation loss: 0.3228
2024-05-21 13:31:40 [INFO]: Epoch 005 - training loss: 0.6421, validation loss: 0.3166
2024-05-21 13:33:01 [INFO]: Epoch 006 - training loss: 0.6305, validation loss: 0.3149
2024-05-21 13:34:25 [INFO]: Epoch 007 - training loss: 0.6216, validation loss: 0.3136
2024-05-21 13:35:48 [INFO]: Epoch 008 - training loss: 0.6142, validation loss: 0.3158
2024-05-21 13:37:11 [INFO]: Epoch 009 - training loss: 0.6074, validation loss: 0.3169
2024-05-21 13:38:31 [INFO]: Epoch 010 - training loss: 0.6020, validation loss: 0.3166
2024-05-21 13:38:31 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-05-21 13:38:31 [INFO]: Fini

In [30]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set
brits_results = brits.predict(dataset_for_testing)
brits_imputation = brits_results["imputation"]

In [31]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    brits_imputation, 
    physionet2012_dataset['test_X_ori'], 
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.2583


## 🚀 An example of **M-RNN** for imputation

In [33]:
from pypots.optim import Adam
from pypots.imputation import MRNN
from pypots.utils.metrics import calc_mae

# initialize the model
# initialize the model
mrnn = MRNN(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    rnn_hidden_size=128,
    batch_size=32,
    # here we set epochs=10 for a quick demo, you can set it to 100 or more for better performance
    epochs=10,
    # here we set patience=3 to early stop the training if the evaluting loss doesn't decrease for 3 epoches.
    # You can leave it to defualt as None to disable early stopping.
    patience=3,
    # give the optimizer. Different from torch.optim.Optimizer, you don't have to specify model's parameters when
    # initializing pypots.optim.Optimizer. You can also leave it to default. It will initilize an Adam optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # this num_workers argument is for torch.utils.data.Dataloader. It's the number of subprocesses to use for data loading.
    # Leaving it to default as 0 means data loading will be in the main process, i.e. there won't be subprocesses.
    # You can increase it to >1 if you think your dataloading is a bottleneck to your model training speed
    num_workers=0,
    # just leave it to default as None, PyPOTS will automatically assign the best device for you.
    # Set it as 'cpu' if you don't have CUDA devices. You can also set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices, even parallelly on ['cuda:0', 'cuda:1']
    device=None,
    # set the path for saving tensorboard and trained model files
    saving_path="tutorial_results/imputation/mrnn",
    # only save the best model after training finished.
    # You can also set it as "better" to save models performing better ever during training.
    model_saving_strategy="best",
)


2024-05-21 14:09:50 [INFO]: Using the given device: cuda:0
2024-05-21 14:09:50 [INFO]: Model files will be saved to tutorial_results/imputation/mrnn/20240521_T140950
2024-05-21 14:09:50 [INFO]: Tensorboard file will be saved to tutorial_results/imputation/mrnn/20240521_T140950/tensorboard
2024-05-21 14:09:50 [INFO]: MRNN initialized with the given hyperparameters, the number of trainable parameters: 107,951


In [34]:
# train the model on the training set, and validate it on the validating set to select the best model for testing in the next step
mrnn.fit(train_set=dataset_for_training, val_set=dataset_for_validating)


2024-05-21 14:10:33 [INFO]: Epoch 001 - training loss: 0.7347, validation loss: 0.9286
2024-05-21 14:10:57 [INFO]: Epoch 002 - training loss: 0.5278, validation loss: 0.8844
2024-05-21 14:11:18 [INFO]: Epoch 003 - training loss: 0.4864, validation loss: 0.8717
2024-05-21 14:11:42 [INFO]: Epoch 004 - training loss: 0.4697, validation loss: 0.8623
2024-05-21 14:12:12 [INFO]: Epoch 005 - training loss: 0.4615, validation loss: 0.8596
2024-05-21 14:12:41 [INFO]: Epoch 006 - training loss: 0.4405, validation loss: 0.8583
2024-05-21 14:13:01 [INFO]: Epoch 007 - training loss: 0.4361, validation loss: 0.8600
2024-05-21 14:13:23 [INFO]: Epoch 008 - training loss: 0.4315, validation loss: 0.8633
2024-05-21 14:13:45 [INFO]: Epoch 009 - training loss: 0.4296, validation loss: 0.8660
2024-05-21 14:13:45 [INFO]: Exceeded the training patience. Terminating the training procedure...
2024-05-21 14:13:45 [INFO]: Finished training. The best model is from epoch#6.
2024-05-21 14:13:45 [INFO]: Saved the mo

In [35]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set
mrnn_results = mrnn.predict(dataset_for_testing)
mrnn_imputation = mrnn_results["imputation"]

In [36]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    mrnn_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.6776


## 🚀 An example of **LOCF** for imputation

In [37]:
from pypots.imputation import LOCF

# initialize the model
locf = LOCF()


2024-05-21 14:13:51 [INFO]: No given device, using default device: cuda


In [38]:
# LOCF doesn't need to be trained, just call the impute() function

locf.fit(train_set=dataset_for_training, val_set=dataset_for_validating)




In [39]:
# the testing stage, impute the originally-missing values and artificially-missing values in the test set
locf_results = locf.predict(dataset_for_testing)
locf_imputation = locf_results["imputation"]

In [40]:
from pypots.utils.metrics import calc_mae

# calculate mean absolute error on the ground truth (artificially-missing values)
testing_mae = calc_mae(
    locf_imputation,
    physionet2012_dataset['test_X_ori'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print(f"Testing mean absolute error: {testing_mae:.4f}")


Testing mean absolute error: 0.4091
