In [1]:
import os
import sys
# Put the directory two levels above the working directory on sys.path
# (once), so that project packages located there can be imported.
module_path = os.path.abspath('../..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

from lmmvae.dim_reduction import run_dim_reduction
from lmmvae.simulation import Count

In [3]:
# Load the Rossmann table and discard the columns that are not used below.
cols_to_drop = ['date', 'year']
rossmann = pd.read_csv('../../data/rossmann_df.csv').drop(columns=cols_to_drop)
# Shift the store ids down by one (presumably 1-based in the file, so that
# they start at 0 -- the head() output below shows ids 0, 1, 2, ...).
rossmann['Store'] = rossmann['Store'] - 1

In [4]:
# Sanity check: (rows, columns) of the frame, then its first five rows.
print(f'{rossmann.shape}')
rossmann.head(n=5)

(33485, 25)


Unnamed: 0,month,Store,Sales,Open,Promo,SchoolHoliday,holiday_0,holiday_a,holiday_b,holiday_c,...,Promo2SinceWeek,Promo2SinceYear,PromoInterval,storetype_a,storetype_b,storetype_c,storetype_d,storeassort_a,storeassort_b,storeassort_c
0,1,0,1.28431,26,10,11,30,1,0,0,...,22,2012,0,0,0,1,0,1,0,0
1,1,1,1.15171,26,10,4,30,1,0,0,...,13,2010,1,1,0,0,0,1,0,0
2,1,2,1.65653,26,10,4,30,1,0,0,...,14,2011,1,1,0,0,0,1,0,0
3,1,3,2.34713,26,10,4,30,1,0,0,...,22,2012,0,0,0,1,0,0,0,1
4,1,4,1.09442,26,10,2,30,1,0,0,...,22,2012,0,1,0,0,0,1,0,0


In [5]:
# Number of distinct store ids; dropna=False keeps the count identical to
# len(Series.unique()), which would also count a missing value.
n_stores = rossmann['Store'].nunique(dropna=False)
print(f'no. of stores: {n_stores}')

no. of stores: 1115


In [6]:
# Rename to the generic column names used from here on:
# the store id becomes 'z0' and the sales column becomes 'y'.
generic_names = {'Store': 'z0', 'Sales': 'y'}
rossmann.rename(columns=generic_names, inplace=True)

In [7]:
# Special columns: 't' for longitudinal data, 'D1'/'D2' for spatial data
# (longitude, latitude), 'z0' for the store id and 'y' for sales.
# Ordinary columns come first, then 't' and 'z0' are appended at the end;
# 'y' (and 'D1'/'D2', should they exist) are left out of X.
special_cols = ('D1', 'D2', 'z0', 't', 'y')
new_cols = [name for name in rossmann.columns if name not in special_cols]
new_cols += ['t', 'z0']
X = rossmann[new_cols]
X.columns

Index(['month', 'Open', 'Promo', 'SchoolHoliday', 'holiday_0', 'holiday_a',
       'holiday_b', 'holiday_c', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'storetype_a',
       'storetype_b', 'storetype_c', 'storetype_d', 'storeassort_a',
       'storeassort_b', 'storeassort_c', 't', 'z0'],
      dtype='object')

In [8]:
# Final X matrix: every column cast to 64-bit float.
X = X.astype('float64')

In [9]:
# Settings handed to run_dim_reduction for LMMVAE and the other methods.
# Some of them are not needed for the current use-case and are therefore None.

# -- problem layout --
mode = 'longitudinal'
longitudinal_predict_future = False  # Change for Future mode
pred_unknown_clusters = False
RE_cols_prefix = 'z'   # the only column of X starting with this prefix is 'z0'
qs = [n_stores]        # single entry: the number of distinct stores
q_spatial = None
n_sig2bs = 2
n_sig2bs_spatial = 0
est_cors = []
max_spatial_locs = 100
kernel_root = None
U = None
B_list = None
thresh = None

# -- network / training --
n_neurons = [1000, 500]
n_neurons_re = n_neurons   # same list object as n_neurons, not a copy
activation = 'relu'
dropout = None
epochs = 200
batch_size = 1000
patience = None

# -- time index: each distinct value of 't' (ascending) -> its rank 0, 1, 2, ... --
time2measure_dict = {t_val: rank for rank, t_val in enumerate(np.sort(X['t'].unique()))}

In [10]:
# Empty results table; iterate_reg_types appends one row per method to it.
result_cols = [
    'd', 'beta', 're_prior', 'experiment', 'exp_type', 'mse_X',
    'sigma_b0_est', 'sigma_b1_est', 'n_epoch', 'time',
    'total_loss_tr', 'recon_loss_tr', 'kl_loss_tr', 're_kl_loss_tr',
    'total_loss_te', 'recon_loss_te', 'kl_loss_te', 're_kl_loss_te',
]
res = pd.DataFrame(columns=result_cols)

# 5-fold CV with a fixed seed so the folds are reproducible.
kf = KFold(random_state=40, shuffle=True, n_splits=5)
counter = Count().gen()

# Every column except the store id 'z0' ...
x_cols = [name for name in X.columns if name != 'z0']
# ... of which the special columns 'D1', 'D2' and 't' are not standardised.
x_cols_to_scale = [name for name in x_cols if name not in ('D1', 'D2', 't')]

In [11]:
def iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose):
    """Run every dimensionality-reduction method on one train/test split and log the results.

    Each method is evaluated through ``run_dim_reduction`` with the
    notebook-level settings (``x_cols``, ``RE_cols_prefix``, ``thresh``,
    ``epochs``, ``qs``, ...), which are read from the global namespace.
    One row per method is appended to the global ``res`` frame as soon as
    that method finishes, and ``res`` is written to
    'res_rossmann_random.csv' after the last method.

    Parameters
    ----------
    X_train, X_test
        Train / test data, passed to ``run_dim_reduction`` unchanged.
    counter
        Iterator; ``next(counter)`` supplies the row label for each new
        row of ``res``.
    d, beta, re_prior
        Passed to ``run_dim_reduction`` and stored in the result row.
    i
        Stored in the 'experiment' column (the caller passes the fold number).
    verbose
        Passed to ``run_dim_reduction``.

    Returns
    -------
    None
    """
    # (method name given to run_dim_reduction, label used in the progress
    #  message and in the 'exp_type' column). Order determines both the
    #  execution order and the row order in `res`.
    methods = [
        ('svgpvae-10-16-2', 'svgpvae_16_2'),
        ('lmmvae', 'lmmvae'),
        ('pca-ignore', 'pca-ignore'),
        ('pca-ohe', 'pca-ohe'),
        ('vae-ignore', 'vae-ignore'),
        ('vae-embed', 'vae-embed'),
        ('vrae', 'vrae'),
    ]
    for method, label in methods:
        mse, sigmas, _, n_epochs, elapsed, losses = run_dim_reduction(
            X_train, X_test, x_cols, RE_cols_prefix, d, method,
            thresh, epochs, qs, q_spatial, n_sig2bs, n_sig2bs_spatial, est_cors,
            batch_size, patience, n_neurons, n_neurons_re, dropout,
            activation, mode, beta, re_prior, kernel_root, pred_unknown_clusters,
            max_spatial_locs, time2measure_dict, verbose, U, B_list)
        print('   finished %s, mse: %.3f' % (label, mse))
        if method == 'lmmvae':
            # Only the lmmvae run reports the two sigma estimates.
            sigma_b0_est, sigma_b1_est = sigmas[1][0], sigmas[1][1]
        else:
            sigma_b0_est, sigma_b1_est = np.nan, np.nan
        # `losses` is concatenated as-is and fills the eight *_tr / *_te columns.
        res.loc[next(counter)] = [d, beta, re_prior, i, label, mse,
                                  sigma_b0_est, sigma_b1_est, n_epochs, elapsed] + losses
    res.to_csv('res_rossmann_random.csv')

In [13]:
# Grid looped over by the experiment below; every combination of
# (beta, d, re_prior) is evaluated on each CV fold.
betas = [1e-3]
ds = [1, 2, 5]
re_priors = [1.0]

In [None]:
# Future mode: the test set is "the future", i.e. the observations with the largest t.
if longitudinal_predict_future:
    X_by_time = X.sort_values(['t'])
    # shuffle=False keeps the time ordering, so the last 20% of rows form X_future.
    X, X_future = train_test_split(X_by_time, shuffle=False, test_size=0.2)
    # Renumber the remaining ("past") rows 0..n-1.
    X.index = np.arange(len(X))

In [15]:
# Cross-validated experiment over the (beta, d, re_prior) grid.
# Random mode: each fold's held-out rows are the test set.
# Future mode: every fold is tested on X_future instead.
for beta in betas:
    for d in ds:
        for re_prior in re_priors:
            print(f'beta: {beta}, d: {d}, re_prior: {re_prior}:')
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                print('  iteration %d' % i)
                # KFold.split yields positional indices, so rows are selected by
                # position (.iloc) rather than by label (.loc); this stays correct
                # even if X's index is not exactly 0..n-1.
                X_train = X.iloc[train_index].copy()
                if longitudinal_predict_future:
                    # X_test would be "the future", X_train would be a sample from "the past"
                    X_test = X_future.copy()
                else:
                    X_test = X.iloc[test_index].copy()
                # Fit the scaler on the training fold only, then apply it to the test set.
                scaler = StandardScaler()
                X_train[x_cols_to_scale] = scaler.fit_transform(X_train[x_cols_to_scale])
                X_test[x_cols_to_scale] = scaler.transform(X_test[x_cols_to_scale])
                iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose=False)