# hivae Example

A general example that can be run on any of three different datasets (Diabetes, Adult or Mock).

It follows the same structure as the Python file; here it is simply presented as a notebook.

In [1]:
# Report matching UserWarnings only once, so that repeated messages such as
# "`tf.layers.dense` is deprecated and will be removed in a future version.
# Please use `tf.keras.layers.Dense` instead." do not flood the cell outputs.

import warnings
warnings.simplefilter('once', UserWarning)

In [2]:
from IPython.display import display, HTML
# Inject a CSS rule that sets elements of class `container` to 80% width.
# The rule is closed with `}` before </style> so the stylesheet is well-formed.
display(HTML("<style>.container {width:80% !important;}</style>"))

In [3]:
import os,os.path

import pprint
# Pretty-print helper: nested containers are shown at most three levels deep,
# anything nested deeper is abbreviated.
_pretty_printer = pprint.PrettyPrinter(depth=3)
printer = _pretty_printer.pprint


In [4]:
import os,os.path

import pandas as pd
import hivae
import numpy as np
import scipy.stats


In [22]:
# Base directory; the data/, results/ and network/ folders are looked up below it
main_directory = '.'

# Dataset used for this run.
# Available choices in this notebook: 'Diabetes', 'Adult', 'Mock'
dataset_name = 'Adult'


In [23]:
# Folder layout for the selected dataset (all below main_directory):
#   data/<dataset_name>     - where the data should be found
#   results/<dataset_name>  - where the results will be saved
#   network/<dataset_name>  - where the networks will be saved
dataset_path = f'{main_directory}/data/{dataset_name}'
results_path = f'{main_directory}/results/{dataset_name}'
network_path = f'{main_directory}/network/{dataset_name}'


In [24]:
# Column descriptions for every dataset, keyed by dataset name.
# Each entry is a 4-tuple starting with (column name, column type, ...); going by
# the examples, the third value is the number of features used for the column and
# the fourth the number of distinct values (None for 'pos' and 'count' columns).
types_list_d = {
    'Diabetes': (
        [
            ('AGE', 'pos', 1, None),  # positive numerical, use one feature
            ('SEX', 'cat', 2, 2),     # categorical - has two values, use two features
        ]
        # all remaining columns are positive numerical with one feature each
        + [
            (name, 'pos', 1, None)
            for name in ('BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6')
        ]
        # ('Y', 'pos', 1, None) is left out: not included in the data
    ),
    'Adult': [
        ('V1', 'count', 1, None),  # count values, use one feature
        ('V2', 'cat', 7, 7),
        ('V3', 'pos', 1, None),
        ('V4', 'ordinal', 16, 16),
        ('V5', 'cat', 7, 7),
        ('V6', 'cat', 14, 14),
        ('V7', 'cat', 6, 6),
        ('V8', 'cat', 5, 5),
        ('V9', 'cat', 2, 2),
        ('V10', 'pos', 1, None),
        ('V11', 'pos', 1, None),
        ('V12', 'count', 1, None),  # count values, use one feature
    ],
    'Mock': [
        ('V1', 'count', 1, None),
        ('V2', 'cat', 2, 2),
        ('V3', 'ordinal', 2, 2),
        ('V4', 'pos', 1, None),
    ],
}
    


In [25]:
# Column description of the selected dataset
types_list = types_list_d[dataset_name]

# File names inside dataset_path (e.g. ./data/Adult)
data_file_org = f'{dataset_path}/data_org.csv'    # read when the train/test files have to be created
data_file     = f'{dataset_path}/data.csv'
data_file_id  = f'{dataset_path}/data_id.csv'
train_file    = f'{dataset_path}/data_train.csv'  # its existence decides whether the split is (re)created
test_file     = f'{dataset_path}/data_test.csv'

# Filled further down with the pandas dataframes for training and testing
train_data = test_data = None


In [26]:
# Create the train/test split on the first run (train_file missing);
# on later runs reload the files that were written back then.
data_df = None
if not os.path.exists(train_file):
    data_df = pd.read_csv(data_file_org,header=None)
    # prepend a zero-padded running ID (000, 001, ...) as first column
    data_df.insert(0,'ID',[str(x).zfill(3) for x in range(len(data_df))])
    print('Len data = ',len(data_df))
    # save data including IDs
    data_df.to_csv(data_file_id,header=False,index=False)
    # drop IDs again and save the plain data
    data_df = data_df.drop('ID', axis=1)
    data_df.to_csv(data_file,header=False,index=False)

    # 30% of the rows (fixed seed) go to the test set, the rest to training
    test_data = data_df.sample(frac=0.3,replace=False,random_state=33)
    # drop() keeps the original row order and avoids indexing .loc with a set,
    # which pandas >= 2.0 rejects with a TypeError
    train_data = data_df.drop(index=test_data.index)

    train_data.to_csv(train_file,header=False,index=False)
    test_data.to_csv(test_file,header=False,index=False)

    # renumber the rows 0..n-1 so the frames in memory are indexed the same way
    # as the ones a later run reads back from the csv files (written without index)
    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
else:
    train_data = pd.read_csv(train_file,header=None)
    test_data  = pd.read_csv(test_file,header=None)


In [27]:
# Mask for the training data, built one column at a time:
# 1 where a value is present, 0 where it is missing (NaN)
missing_true_train = pd.DataFrame()
for column in list(train_data.columns.values):
    observed = train_data[column].notna()
    missing_true_train[column] = observed.astype('int64')


In [28]:

# construct missing data mask for the test data, column by column:
# 1 where a value is present, 0 where it is missing (NaN)
missing_true_test = pd.DataFrame()
for x in list(test_data.columns.values):
    # assign into column x - rebinding missing_true_test itself would replace
    # the whole frame by a single Series on every pass
    missing_true_test[x] = test_data[x].isna().map({True:0,False:1})



In [31]:
# Frequency of each distinct value in column 3 of the training data
# (presumably 'V4', the ordinal column, when dataset_name is 'Adult' - verify)
train_data[3].value_counts()

9     7361
10    5025
13    3783
14    1181
11     960
7      833
12     760
6      675
4      476
15     402
5      366
8      313
16     272
3      236
2      118
1       32
Name: 3, dtype: int64

In [13]:
# Network settings per dataset. All datasets share batch size and model name;
# only the dimensions dim_z / dim_y / dim_s differ.
network_dict = {
    name: {
        'batch_size': 32,
        'model_name': 'model_HIVAE_inputDropout',
        'dim_z': dim_z,
        'dim_y': dim_y,
        'dim_s': dim_s,
    }
    for name, (dim_z, dim_y, dim_s) in (
        ('Diabetes', (5, 5, 10)),
        ('Adult', (8, 10, 10)),
        ('Mock', (3, 3, 3)),
    )
}

# Value passed as `epochs` when fitting, per dataset
iterations = dict(Diabetes=50, Adult=10, Mock=10)

In [14]:
# Create the hivae object from the column description and the network settings
# of the selected dataset, and tell it where results and networks are stored.
hivae_obj = hivae.hivae(
    types_list,
    network_dict[dataset_name],
    results_path=results_path,
    network_path=network_path,
)


self.full_network_path ./network/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
self.full_results_path ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
self.network_file_name ./network/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_ckpt


In [15]:
# Show the network directory that was passed to hivae.hivae
network_path

'./network/Diabetes'

In [16]:
# Show the results directory that was passed to hivae.hivae
results_path

'./results/Diabetes'

In [17]:
# Fit on the training data, using the epoch count configured for the selected
# dataset and the training mask as true_missing_mask.
hivae_obj.fit(
    train_data,
    epochs=iterations[dataset_name],
    true_missing_mask=missing_true_train,
)


[*] Importing model: model_HIVAE_inputDropout
[*] Defining placeholders
[*] Defining Encoder...
[*] Defining Decoder...


  log_pi = tf.compat.v1.layers.dense(inputs=X, units=s_dim, activation=None,
  samples['y'] = tf.compat.v1.layers.dense(inputs=samples['z'], units=y_dim, activation=None,


[*] Defining Cost function...
Metal device set to: Apple M1
INFO :	 Training the HVAE ...
INFO :	 Initizalizing Variables ...


2022-08-25 14:54:00.046249: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-25 14:54:00.046374: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-08-25 14:54:00.114839: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:354] MLIR V1 optimization pass is not enabled
2022-08-25 14:54:00.144624: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-08-25 14:54:00.145490: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-25 14:54:00.286030: I tensorflow/core/grappler/optimizers/custom_graph

INFO :	 Training Finished ...
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_loglik.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_KL_s.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_KL_z.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_train_error.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_test_error.csv in ./results/Diabetes/model

2022-08-25 14:54:53.398253: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


INFO:tensorflow:./network/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_ckpt.index
INFO:tensorflow:0
INFO:tensorflow:./network/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_ckpt.data-00000-of-00001
INFO:tensorflow:0
INFO:tensorflow:./network/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_ckpt.meta
INFO:tensorflow:1100


In [18]:
# Run predict() on the test data and unpack the five values it returns.
# NOTE: test_data is rebound to the first returned value, so from here on it no
# longer refers to the dataframe that was loaded/created further up.
(
    test_data,
    test_data_reconstructed,
    test_data_decoded,
    test_data_embedded_z,
    test_data_embedded_s,
) = hivae_obj.predict(test_data, true_missing_mask=missing_true_test)


[*] Importing model: model_HIVAE_inputDropout
[*] Defining placeholders
[*] Defining Encoder...
[*] Defining Decoder...


  log_pi = tf.compat.v1.layers.dense(inputs=X, units=s_dim, activation=None,
  samples['y'] = tf.compat.v1.layers.dense(inputs=samples['z'], units=y_dim, activation=None,


[*] Defining Cost function...
INFO :	 Testing the HVAE ...
INFO:tensorflow:Restoring parameters from ./network/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_ckpt


2022-08-25 14:55:00.969495: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-08-25 14:55:00.969769: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2022-08-25 14:55:01.210879: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-08-25 14:55:01.326896: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


INFO :	 Model restored.


2022-08-25 14:55:02.769448: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


INFO :	 Testing Finished ...
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_data_reconstruction.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_data_true.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83
INFO :	 Saving model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83_data_loglik_mean_reconstructed.csv in ./results/Diabetes/model_HIVAE_inputDropout_s10_z5_y5_batch32_dc745f86-39c1-446e-9fd1-c89155fdba83


In [19]:
# Wrap each of the five predict() results in a pandas DataFrame
(
    df_test_data,
    df_test_data_reconstructed,
    df_test_data_decoded,
    df_test_data_embedded_z,
    df_test_data_embedded_s,
) = (
    pd.DataFrame(values)
    for values in (
        test_data,
        test_data_reconstructed,
        test_data_decoded,
        test_data_embedded_z,
        test_data_embedded_s,
    )
)


In [20]:
# Display the test data as returned by predict()
df_test_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,61.0,1.0,28.2,106.00,204.0,132.0,52.0,4.00,4.6052,96.0
1,61.0,1.0,23.1,113.00,186.0,114.4,47.0,4.00,4.8122,105.0
2,26.0,1.0,30.3,89.00,218.0,152.2,31.0,7.00,5.1591,82.0
3,34.0,1.0,30.0,83.00,185.0,107.2,53.0,3.00,4.8203,92.0
4,58.0,0.0,25.7,99.00,157.0,91.6,49.0,3.00,4.4067,93.0
...,...,...,...,...,...,...,...,...,...,...
128,50.0,1.0,29.6,94.33,300.0,242.4,33.0,9.09,4.8122,109.0
129,41.0,1.0,25.7,83.00,181.0,106.6,66.0,3.00,3.7377,85.0
130,66.0,1.0,26.0,91.00,264.0,146.6,65.0,4.00,5.5683,87.0
131,79.0,1.0,23.3,88.00,186.0,128.4,33.0,6.00,4.8122,102.0


In [21]:
# Display the reconstructed test data returned by predict()
df_test_data_reconstructed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,61.0,1.0,28.2,106.00,204.0,132.0,52.0,4.00,4.6052,96.0
1,61.0,1.0,23.1,113.00,186.0,114.4,47.0,4.00,4.8122,105.0
2,26.0,1.0,30.3,89.00,218.0,152.2,31.0,7.00,5.1591,82.0
3,34.0,1.0,30.0,83.00,185.0,107.2,53.0,3.00,4.8203,92.0
4,58.0,0.0,25.7,99.00,157.0,91.6,49.0,3.00,4.4067,93.0
...,...,...,...,...,...,...,...,...,...,...
128,50.0,1.0,29.6,94.33,300.0,242.4,33.0,9.09,4.8122,109.0
129,41.0,1.0,25.7,83.00,181.0,106.6,66.0,3.00,3.7377,85.0
130,66.0,1.0,26.0,91.00,264.0,146.6,65.0,4.00,5.5683,87.0
131,79.0,1.0,23.3,88.00,186.0,128.4,33.0,6.00,4.8122,102.0
