## Demo UCI experiments 

This notebook is intended to present some of our results on the UCI dataset.
The source code for our project can be found in the /code/ folder.
In this notebook, we train a not_MIWAE model on the UCI Breast cancer dataset.


#### First type of missing data
In this first part, we tackle a new type of missing data, that was not presented in the original paper. We introduce missing values by removing the most extreme values of the dataset in half of the features. We remove the values located in the first and last quartile.



In [48]:
# imports
import torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
import datetime
from code.common.data_imputation import compute_imputation_rmse_not_miwae
from code.code_UCI_experiments.not_miwae import notMIWAE, get_notMIWAE
from code.code_UCI_experiments.train import train_notMIWAE
from sklearn.model_selection import train_test_split
import logging
from code.code_UCI_experiments.introduce_missing_data import introduce_missing_mean_values, introduce_missing_extreme_values

In [49]:
# Loading data
data = np.array(pd.read_csv('datasets/cancer-dataset/Cancer_Data.csv', low_memory=False, sep=','))
X_data = data[:, 2:-2]  # Features

In [50]:
# Split data into train and validation sets
Xtrain, Xval = train_test_split(X_data, test_size=0.2, random_state=42, shuffle=True)

# Compute mean and standard deviation from the training set only
mean_train  = np.mean(Xtrain.astype(np.float64), axis=0)
std_train   = np.std(Xtrain.astype(np.float64), axis=0)

# Standardize the training set using its mean and std
Xtrain      = (Xtrain - mean_train) / std_train
total_samples_x_train = Xtrain.shape[0]
# Standardize the validation set using the training set's mean and std
Xval = (Xval - mean_train) / std_train

# Introduce missing data to features
Xnan_train, Xz_train    = introduce_missing_extreme_values(Xtrain)
Xnan_val, Xz_val        = introduce_missing_extreme_values(Xval)

Introducing missing data via removing extreme values
Introducing missing data via removing extreme values


In [38]:
# Create missing data masks (1 if present, 0 if missing)
Strain  = torch.tensor(~np.isnan(Xnan_train), dtype=torch.float32)
Sval    = torch.tensor(~np.isnan(Xnan_val), dtype=torch.float32)

Xtrain  = Xtrain.astype(np.float32)
Xval    = Xval.astype(np.float32)

# Convert features and target to PyTorch tensors
Xnan_train  = torch.tensor(Xnan_train, dtype=torch.float32)
Xnan_val    = torch.tensor(Xnan_val, dtype=torch.float32)
Xtrain      = torch.tensor(Xtrain, dtype= torch.float32)
Xval        = torch.tensor(Xval, dtype= torch.float32)

# Replace missing values (NaN) with zeros for training
Xnan_train[torch.isnan(Xnan_train)]     = 0
Xnan_val[torch.isnan(Xnan_val)]         = 0

# Prepare TensorDatasets and DataLoaders for features (X), mask (S), and target (y)
train_dataset   = TensorDataset(Xnan_train, Strain, Xtrain) # Features, mask, true features
val_dataset     = TensorDataset(Xnan_val, Sval, Xval)

device = "cuda" if torch.cuda.is_available() else 'cpu'


In [26]:

calib_config = [
                {'model': 'not_miwae', 'dataset_name':  'cancer', 'lr': 1e-4, 'epochs' : 100, 'pct_start': 0.2, 'final_div_factor': 1e4, 'batch_size': 16, 'n_hidden': 128, 'n_latent': 28, 'missing_process':'linear', 'weight_decay': 0, 'betas': (0.9, 0.999), 'random_seed': 0, 'out_dist': 'gauss'},

                ][-1]

In [27]:
# Create DataLoaders
train_loader    = DataLoader(train_dataset, batch_size=calib_config['batch_size'], shuffle=True)
val_loader      = DataLoader(val_dataset, batch_size=calib_config['batch_size'], shuffle=False)

In [28]:
model = notMIWAE(n_input_features=Xtrain.shape[1], n_hidden=calib_config['n_hidden'], n_latent = calib_config['n_latent'], missing_process = calib_config['missing_process'], out_dist=calib_config['out_dist'])


Number of parameters in the BernouilliDecoderMiss: 870


In [29]:
model.to(device)
date = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
print(f"Training timestamp : {date}")
print(f"Number of parameters in the model: {sum (p.numel() if p.requires_grad else 0 for p in model.parameters()) }")
optimizer = torch.optim.Adam(model.parameters(), lr=calib_config['lr'], weight_decay=calib_config['weight_decay'], betas=calib_config['betas'])
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                max_lr = calib_config['lr'],
                                                epochs = calib_config['epochs'],
                                                steps_per_epoch= len(train_loader),
                                                pct_start= calib_config['pct_start'],
                                                )
print(f"calib_config:{calib_config}")
logging.info("Starting training")
if calib_config['model'] == 'not_miwae':
    train_notMIWAE(model, train_loader=train_loader, val_loader=val_loader, optimizer=optimizer, scheduler = scheduler, num_epochs = calib_config['epochs'], total_samples_x_train=total_samples_x_train, device = device, date=date)
    torch.save(model.state_dict(), f"temp/not_miwae_{date}_last_epoch.pt")

Training timestamp : 2024_12_11_14_15_00
Number of parameters in the model: 56152
calib_config:{'model': 'not_miwae', 'dataset_name': 'cancer', 'lr': 0.0001, 'epochs': 100, 'pct_start': 0.2, 'final_div_factor': 10000.0, 'batch_size': 16, 'n_hidden': 128, 'n_latent': 28, 'missing_process': 'linear', 'weight_decay': 0, 'betas': (0.9, 0.999), 'random_seed': 0, 'out_dist': 'gauss'}
Epoch    1, Train Loss:  44.1113 , Train rmse:  1.7057 , Val Loss:  46.0794 , Val RMSE:  1.5257  last value of lr: 0.0000
Epoch    2, Train Loss:  44.1120 , Train rmse:  1.6841 , Val Loss:  45.9932 , Val RMSE:  1.5830  last value of lr: 0.0000
Epoch    3, Train Loss:  44.3436 , Train rmse:  1.6801 , Val Loss:  45.6282 , Val RMSE:  1.5767  last value of lr: 0.0000
Epoch    4, Train Loss:  44.1224 , Train rmse:  1.6198 , Val Loss:  45.8035 , Val RMSE:  1.5873  last value of lr: 0.0000
Epoch    5, Train Loss:  44.1230 , Train rmse:  1.6653 , Val Loss:  45.5128 , Val RMSE:  1.5763  last value of lr: 0.0000
Epoch    

In [30]:
# load the best model
model.load_state_dict(torch.load(f"temp/not_miwae_{date}_best_val_loss.pt", weights_only=True))

<All keys matched successfully>

In [31]:
# get best validation accuracy
model.eval()
val_loss = 0
val_rmse = 0
with torch.no_grad():
    for x, s, xtrue in val_loader:
        x, s, xtrue = x.to(device), s.to(device), xtrue.to(device)
        mu, lpxz, lpmz, lqzx, lpz   = model(x, s, total_samples_x_train)
        loss            = -get_notMIWAE(total_samples_x_train, lpxz, lpmz, lqzx, lpz)
        val_loss        += loss.item()
        batch_rmse      = compute_imputation_rmse_not_miwae(mu, lpxz, lpmz, lqzx, lpz, xtrue, s)
        val_rmse        += batch_rmse
val_loss /= len(val_loader)
val_rmse /= len(val_loader)
print(f"Val RMSE: {val_rmse}")

Val RMSE: 1.5422841310501099


#### Another type of missing data
In this new type of missing data, we introduce missing not at random data by removing values on the first $int(n_{features}/2)$ columns of the dataset. For each of these columns, 30% of the values are removed. The removed values are the 30% closest one to the mean using the L1 norm.

In [44]:
# Split data into train and validation sets
Xtrain, Xval = train_test_split(X_data, test_size=0.2, random_state=42, shuffle=True)

# Compute mean and standard deviation from the training set only
mean_train  = np.mean(Xtrain.astype(np.float64), axis=0)
std_train   = np.std(Xtrain.astype(np.float64), axis=0)

# Standardize the training set using its mean and std
Xtrain      = (Xtrain - mean_train) / std_train
total_samples_x_train = Xtrain.shape[0]
# Standardize the validation set using the training set's mean and std
Xval = (Xval - mean_train) / std_train

# Introduce missing data to features
Xnan_train, Xz_train    = introduce_missing_mean_values(Xtrain)
Xnan_val, Xz_val        = introduce_missing_mean_values(Xval)
# Create missing data masks (1 if present, 0 if missing)
Strain  = torch.tensor(~np.isnan(Xnan_train), dtype=torch.float32)
Sval    = torch.tensor(~np.isnan(Xnan_val), dtype=torch.float32)

Xtrain  = Xtrain.astype(np.float32)
Xval    = Xval.astype(np.float32)

# Convert features and target to PyTorch tensors
Xnan_train  = torch.tensor(Xnan_train, dtype=torch.float32)
Xnan_val    = torch.tensor(Xnan_val, dtype=torch.float32)
Xtrain      = torch.tensor(Xtrain, dtype= torch.float32)
Xval        = torch.tensor(Xval, dtype= torch.float32)

# Replace missing values (NaN) with zeros for training
Xnan_train[torch.isnan(Xnan_train)]     = 0
Xnan_val[torch.isnan(Xnan_val)]         = 0

# Prepare TensorDatasets and DataLoaders for features (Xnan_train), mask (S), and true values (Xtrain
train_dataset   = TensorDataset(Xnan_train, Strain, Xtrain) # Features, mask, true features
val_dataset     = TensorDataset(Xnan_val, Sval, Xval)

device = "cuda" if torch.cuda.is_available() else 'cpu'
# Create DataLoaders
train_loader    = DataLoader(train_dataset, batch_size=calib_config['batch_size'], shuffle=True)
val_loader      = DataLoader(val_dataset, batch_size=calib_config['batch_size'], shuffle=False)


Introduce missing data by removing the values around the mean
Introduce missing data by removing the values around the mean


In [45]:
model = notMIWAE(n_input_features=Xtrain.shape[1], n_hidden=calib_config['n_hidden'], n_latent = calib_config['n_latent'], missing_process = calib_config['missing_process'], out_dist=calib_config['out_dist'])


Number of parameters in the BernouilliDecoderMiss: 870


In [46]:
model.to(device)
date = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
print(f"Training timestamp : {date}")
print(f"Number of parameters in the model: {sum (p.numel() if p.requires_grad else 0 for p in model.parameters()) }")
optimizer = torch.optim.Adam(model.parameters(), lr=calib_config['lr'], weight_decay=calib_config['weight_decay'], betas=calib_config['betas'])
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                                max_lr = calib_config['lr'],
                                                epochs = calib_config['epochs'],
                                                steps_per_epoch= len(train_loader),
                                                pct_start= calib_config['pct_start'],
                                                )
print(f"calib_config:{calib_config}")
logging.info("Starting training")
if calib_config['model'] == 'not_miwae':
    train_notMIWAE(model, train_loader=train_loader, val_loader=val_loader, optimizer=optimizer, scheduler = scheduler, num_epochs = calib_config['epochs'], total_samples_x_train=total_samples_x_train, device = device, date=date)
    torch.save(model.state_dict(), f"temp/not_miwae_{date}_last_epoch.pt")

Training timestamp : 2024_12_11_14_33_07
Number of parameters in the model: 56152
calib_config:{'model': 'not_miwae', 'dataset_name': 'cancer', 'lr': 0.0001, 'epochs': 100, 'pct_start': 0.2, 'final_div_factor': 10000.0, 'batch_size': 16, 'n_hidden': 128, 'n_latent': 28, 'missing_process': 'linear', 'weight_decay': 0, 'betas': (0.9, 0.999), 'random_seed': 0, 'out_dist': 'gauss'}
Epoch    1, Train Loss:  46.2630 , Train rmse:  1.6848 , Val Loss:  47.8207 , Val RMSE:  1.4676  last value of lr: 0.0000
Epoch    2, Train Loss:  45.2744 , Train rmse:  1.7427 , Val Loss:  47.5256 , Val RMSE:  1.6143  last value of lr: 0.0000
Epoch    3, Train Loss:  45.7471 , Train rmse:  1.6883 , Val Loss:  47.6082 , Val RMSE:  1.6952  last value of lr: 0.0000
Epoch    4, Train Loss:  46.1216 , Train rmse:  1.6651 , Val Loss:  47.5352 , Val RMSE:  1.5348  last value of lr: 0.0000
Epoch    5, Train Loss:  45.7331 , Train rmse:  1.6859 , Val Loss:  47.9459 , Val RMSE:  1.5873  last value of lr: 0.0000
Epoch    

In [47]:
model.load_state_dict(torch.load(f"temp/not_miwae_{date}_best_val_loss.pt", weights_only=True))
# get best validation accuracy
model.eval()
val_loss = 0
val_rmse = 0
with torch.no_grad():
    for x, s, xtrue in val_loader:
        x, s, xtrue = x.to(device), s.to(device), xtrue.to(device)
        mu, lpxz, lpmz, lqzx, lpz   = model(x, s, total_samples_x_train)
        loss            = -get_notMIWAE(total_samples_x_train, lpxz, lpmz, lqzx, lpz)
        val_loss        += loss.item()
        batch_rmse      = compute_imputation_rmse_not_miwae(mu, lpxz, lpmz, lqzx, lpz, xtrue, s)
        val_rmse        += batch_rmse
val_loss /= len(val_loader)
val_rmse /= len(val_loader)
print(f"Val RMSE: {val_rmse}")

Val RMSE: 1.0372258424758911
