In [1]:
import os
import sys
# Resolve the directory two levels above the working directory and register it
# on the import path (presumably the repository root holding `lmmvae` -- confirm).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
# Only append when absent so re-running the cell does not add duplicates.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmvae.dim_reduction_images import run_dim_reduction_images
from lmmvae.simulation import Count

In [3]:
# Per-image metadata table: file name, facial landmark coordinates and celebrity ids.
images_df = pd.read_csv(filepath_or_buffer='../../data/celeba_small.csv')

# Preview the first five rows.
images_df.head(5)

Unnamed: 0,img_file,lefteye_x,lefteye_y,righteye_x,righteye_y,nose_x,nose_y,leftmouth_x,leftmouth_y,rightmouth_x,rightmouth_y,celeb_orig,celeb
0,000001.png,69,109,106,113,77,142,73,152,108,154,2880,1540
1,000002.png,69,110,107,112,81,135,70,151,108,153,2937,1573
2,000003.png,76,112,104,106,108,128,74,156,98,158,8692,4689
3,000004.png,72,113,108,108,101,138,71,155,101,151,5805,3128
4,000005.png,66,114,112,112,86,119,71,147,104,150,9295,5005


In [4]:
from PIL import Image

def read_image(img_file, height=72, width=60):
    """Load an image, resize it, and return it as a float32 array scaled by 1/255.

    Parameters
    ----------
    img_file : str or file-like
        Anything accepted by ``PIL.Image.open``.
    height : int, default 72
        Target height in pixels.
    width : int, default 60
        Target width in pixels.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of the resized image with every value divided by 255
        (i.e. in [0, 1] assuming 8-bit input -- TODO confirm for other modes).
    """
    # The context manager releases the underlying file handle deterministically,
    # even if resize raises; this matters when the function is called thousands
    # of times in a row.
    with Image.open(img_file) as img:
        # PIL expects (width, height); resize returns a new, fully loaded image,
        # so it remains usable after the source file is closed.
        resized = img.resize((width, height))
    return np.asarray(resized, dtype=np.float32) / 255.

In [5]:
img_path = '../../data/img_align_celeba_png/'

# Decode every file listed in the metadata table, then stack into one array.
images = [read_image(img_path + img_name) for img_name in images_df['img_file']]
X = np.array(images)
del images  # the list is no longer needed once X holds the pixel data

# Column(s) passed on as Z, kept 2-D (one column per entry of RE_cols).
RE_cols = ['celeb']
Z = images_df[RE_cols].values

print(X.shape) # (10000, 72, 60, 3)
print(Z.shape) # (10000, 1)

(10000, 72, 60, 3)
(10000, 1)


In [6]:
# Report the memory footprint of the image array X as measured by pympler
# (presumably in bytes -- the recorded value 518400176 is consistent with
# 10000 * 72 * 60 * 3 float32 values plus a small object overhead).
from pympler import asizeof
asizeof.asizeof(X)

518400176

In [7]:
# Number of distinct values in the 'celeb' column.
n_cats_celebs = images_df['celeb'].unique().size
print(f'no. of sources: {n_cats_celebs}')

no. of sources: 5429


In [8]:
# Settings forwarded to run_dim_reduction_images for LMMVAE and the other
# methods. Entries that are not needed for the current use-case are None/empty.
# (Grouping below is by apparent role only; semantics are defined by lmmvae.)

# -- image geometry, read off the loaded array
img_height, img_width, channels = X.shape[1:]

# -- random-effects configuration
mode = 'categorical'
RE_cols_prefix = 'z'
qs = [n_cats_celebs]
q_spatial = None
n_sig2bs = 1
n_sig2bs_spatial = 0
est_cors = []
pred_unknown_clusters = False
max_spatial_locs = 100

# -- network settings
n_neurons = [32, 16]
n_neurons_re = n_neurons  # same list object as n_neurons, not a copy
dropout = None
activation = 'relu'

# -- training settings
epochs = 200
batch_size = 1000
patience = None
thresh = None

# -- unused here
kernel_root = None
U = None
B_list = None
time2measure_dict = None

In [9]:
# Empty results table: one row is appended per (setting, fold, method) run.
res = pd.DataFrame(
    columns=[
        # run identification and headline metrics
        'd', 'beta', 're_prior', 'experiment', 'exp_type',
        'mse_X', 'sigma_b0_est', 'n_epoch', 'time',
    ] + [
        # losses, first the '_tr' set then the '_te' set
        'total_loss_tr', 'recon_loss_tr', 'kl_loss_tr', 're_kl_loss_tr',
        'total_loss_te', 'recon_loss_te', 'kl_loss_te', 're_kl_loss_te',
    ]
)

# 5-fold splitter with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=40)

# Iterator consumed with next() to obtain the row label for each new `res` row.
counter = Count().gen()

In [10]:
def iterate_reg_types(X_train, X_test, Z_train, Z_test, counter, d, beta, re_prior, i, verbose):
    """Run every dimensionality-reduction method on one split and record the results.

    Calls ``run_dim_reduction_images`` once per method, in the order
    'lmmvae', 'pca-ignore', 'pca-ohe', 'vae-ignore', 'vae-embed'. After all
    methods have finished, one row per method is appended to the module-level
    ``res`` DataFrame and ``res`` is written to 'res_celeba.csv'.

    Besides its arguments, the function reads the module-level settings
    (``img_height``, ``img_width``, ``channels``, ``thresh``, ``epochs``, ``qs``,
    ``q_spatial``, ``n_sig2bs``, ``n_sig2bs_spatial``, ``est_cors``,
    ``batch_size``, ``patience``, ``n_neurons``, ``n_neurons_re``, ``dropout``,
    ``activation``, ``mode``, ``kernel_root``, ``pred_unknown_clusters``,
    ``max_spatial_locs``, ``time2measure_dict``, ``U``, ``B_list``).

    Parameters
    ----------
    X_train, X_test, Z_train, Z_test
        Train/test data forwarded unchanged to ``run_dim_reduction_images``.
    counter : iterator
        ``next(counter)`` supplies the row label for each new row of ``res``.
    d, beta, re_prior
        Forwarded to ``run_dim_reduction_images`` and stored in the result rows.
    i : int
        Fold/iteration number, stored in the 'experiment' column.
    verbose
        Forwarded to ``run_dim_reduction_images``.

    Returns
    -------
    None
    """
    exp_types = ['lmmvae', 'pca-ignore', 'pca-ohe', 'vae-ignore', 'vae-embed']

    # Run all methods first; rows are only written once every method has finished.
    outcomes = []
    for exp_type in exp_types:
        mse, sigmas, _, n_epochs, runtime, losses = run_dim_reduction_images(
            X_train, X_test, Z_train, Z_test,
            img_height, img_width, channels, d, exp_type,
            thresh, epochs, qs, q_spatial, n_sig2bs, n_sig2bs_spatial, est_cors,
            batch_size, patience, n_neurons, n_neurons_re, dropout,
            activation, mode, beta, re_prior, kernel_root, pred_unknown_clusters,
            max_spatial_locs, time2measure_dict, verbose, U, B_list)
        print('   finished %s, mse: %.3f' % (exp_type, mse))
        outcomes.append((exp_type, mse, sigmas, n_epochs, runtime, losses))

    for exp_type, mse, sigmas, n_epochs, runtime, losses in outcomes:
        # Only the lmmvae run reports a variance estimate; other methods get NaN.
        sigma_b0_est = sigmas[1][0] if exp_type == 'lmmvae' else np.nan
        res.loc[next(counter)] = [d, beta, re_prior, i, exp_type, mse, sigma_b0_est, n_epochs, runtime] + losses

    # Persist after every split so finished folds survive a later interruption.
    res.to_csv('res_celeba.csv')

In [11]:
# Grid of settings swept by the experiment loop; every combination of
# (beta, d, re_prior) is run on each fold.
betas = [0.01]
re_priors = [0.001]
ds = [100, 200, 500]

In [13]:
# Sweep every (beta, d, re_prior) combination over the 5 folds and run all methods.
for beta in betas:
  for d in ds:
    for re_prior in re_priors:
      print(f'beta: {beta}, d: {d}, re_prior: {re_prior}:')
      if pred_unknown_clusters:
        # Hold out whole celebrities: split the actual celeb ids (sorted) rather
        # than range(n_cats_celebs), so the folds stay correct even if the ids
        # are not exactly 0..n-1 (when they are, the folds are identical).
        celeb_ids = np.sort(images_df['celeb'].unique())
        # np.flatnonzero yields positional row numbers, which is what indexing
        # X and Z needs regardless of the DataFrame's index labels.
        folds = (
          (np.flatnonzero(images_df['celeb'].isin(celeb_ids[train_pos]).values),
           np.flatnonzero(images_df['celeb'].isin(celeb_ids[test_pos]).values))
          for train_pos, test_pos in kf.split(celeb_ids)
        )
      else:
        # Random split over individual images.
        folds = kf.split(X)
      for i, (train_index, test_index) in enumerate(folds):
        print('  iteration %d' % i)
        # Integer-array indexing already returns fresh copies, so no extra
        # .copy() is needed (avoids a second transient copy of a large array).
        X_train, X_test = X[train_index], X[test_index]
        Z_train, Z_test = Z[train_index], Z[test_index]
        iterate_reg_types(X_train, X_test, Z_train, Z_test, counter, d, beta, re_prior, i, verbose=True)

beta: 0.01, d: 100, re_prior: 0.001:
  iteration 0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


KeyboardInterrupt: 