In [1]:
import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import path (once) so the local `lmmvae` package imports.
module_path = os.path.abspath('../..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmvae.dim_reduction import run_dim_reduction
from lmmvae.simulation import Count
from lmmvae.utils import get_dummies, get_posterior_b_root

In [3]:
# Load the cars dataset from the repository's data directory (path is relative
# to the notebook's working directory).
cars = pd.read_csv(filepath_or_buffer='../../data/cars_df5.csv')

In [4]:
# Sanity check of the load: print (n_rows, n_cols), then preview the first rows.
print(cars.shape)
cars.head(5)

(97729, 78)


Unnamed: 0,price,year,odometer,lat,long,model_id,location_id,manufacturerbmw,manufacturerchevrolet,manufacturerdodge,...,paint_colorcustom,paint_colorgreen,paint_colorgrey,paint_color_na,paint_colororange,paint_colorpurple,paint_colorred,paint_colorsilver,paint_colorwhite,paint_coloryellow
0,33590,0.251077,0.57923,-3.416363,3.749557,12003,1655,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,22590,-0.318553,0.71229,-3.416363,3.749557,12239,1655,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,39590,1.105521,0.1916,-3.416363,3.749557,12278,1655,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,30990,0.678299,0.41124,-3.416363,3.749557,14156,1655,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,27990,-0.033738,0.68696,-3.416363,3.749557,12089,1655,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# One row per distinct (location_id, lat, long) combination found in `cars`;
# the resulting array columns are [location_id, lat, long].
location_keys = cars.groupby(['location_id','lat', 'long']).size().index
coords = location_keys.to_frame().values
coords[:5]

array([[  0.        , -10.        ,   5.05988236],
       [  1.        ,  -9.90823057,   5.15626107],
       [  2.        ,  -9.62636732,   5.50634257],
       [  3.        ,  -9.5698308 ,   5.53539792],
       [  4.        ,  -9.17735262,   5.5679966 ]])

In [6]:
# Squared pairwise Euclidean distances between the rows of `coords`, using
# every column except the first (i.e. lat and long, not the location id).
latlong = coords[:, 1:]
pairwise_dist = squareform(pdist(latlong))
dist_matrix = pairwise_dist ** 2
dist_matrix.shape

(12235, 12235)

In [7]:
# Rename the spatial coordinates to D1/D2 and the grouping ids to z0/z1, the
# column names used by the rest of this notebook.
cars = cars.rename(columns={
    'lat': 'D1',
    'long': 'D2',
    'location_id': 'z0',
    'model_id': 'z1',
})

In [8]:
# Everything except the `price` column is kept in X.
X = cars.drop(columns=['price'])

In [9]:
# Move the special columns to the end of the frame: D1/D2 hold the spatial
# coordinates (D1 = latitude, D2 = longitude) and z0/z1 hold the location and
# model ids. All remaining columns keep their original relative order.
special_cols = ['D1', 'D2', 'z0', 'z1']
regular_cols = [col for col in X.columns if col not in special_cols]
new_cols = regular_cols + special_cols
X = X[new_cols]
X.columns

Index(['year', 'odometer', 'manufacturerbmw', 'manufacturerchevrolet',
       'manufacturerdodge', 'manufacturerford', 'manufacturergmc',
       'manufacturerhonda', 'manufacturerjeep', 'manufacturernissan',
       'manufacturerother', 'manufacturerram', 'manufacturertoyota',
       'conditionexcellent', 'conditionfair', 'conditiongood',
       'conditionlike_new', 'condition_na', 'conditionnew', 'conditionsalvage',
       'fueldiesel', 'fuelelectric', 'fuelgas', 'fuelhybrid', 'fuel_na',
       'fuelother', 'title_statusclean', 'title_statuslien',
       'title_statusmissing', 'title_status_na', 'title_statusparts_only',
       'title_statusrebuilt', 'title_statussalvage', 'transmissionautomatic',
       'transmissionmanual', 'transmission_na', 'transmissionother',
       'drive4wd', 'drivefwd', 'drive_na', 'driverwd', 'sizecompact',
       'sizefull_size', 'sizemid_size', 'size_na', 'sizesub_compact',
       'typebus', 'typeconvertible', 'typecoupe', 'typehatchback',
       'typemini_

In [10]:
# Settings shared by every `run_dim_reduction` call in this notebook (LMMVAE
# and the comparison methods). Options that are not needed for this use-case
# are left as None.

# --- random-effects layout -------------------------------------------------
mode = 'spatial_and_categorical'
RE_cols_prefix = 'z'
n_sig2bs = 1 # one categorical feature (model)
n_sig2bs_spatial = 2
qs = [len(X['z1'].unique())]        # number of distinct z1 values
q_spatial = len(X['z0'].unique())   # number of distinct z0 values
est_cors = []
max_spatial_locs = 100
pred_unknown_clusters = False # Change for Unknown mode

# --- network / optimisation --------------------------------------------------
n_neurons = [1000, 500]
n_neurons_re = n_neurons            # same list object as n_neurons
activation = 'relu'
epochs = 200
batch_size = 1000

# --- spatial kernel ----------------------------------------------------------
# dist_matrix already holds squared distances, so this is exp(-d^2 / 2).
# NOTE(review): the literal 1 in (2 * 1) looks like a scale placeholder — confirm.
kernel = np.exp(-dist_matrix / (2 * 1))
Z = get_dummies(X['z0'], q_spatial)
kernel_root = get_posterior_b_root(kernel, Z, sig2e=1, n=X.shape[0], n_samp=10000)

# --- options left unset for this use-case ------------------------------------
dropout = thresh = patience = None
U = B_list = time2measure_dict = None

In [11]:
# Empty results table; `iterate_reg_types` appends one row per fitted method.
run_info_cols = ['d', 'beta', 're_prior', 'experiment', 'exp_type', 'mse_X',
                 'sigma_b0_spatial_est', 'n_epoch', 'time']
loss_cols = ['total_loss_tr', 'recon_loss_tr', 'kl_loss_tr', 're_kl_loss_tr',
             'total_loss_te', 'recon_loss_te', 'kl_loss_te', 're_kl_loss_te']
res = pd.DataFrame(columns=run_info_cols + loss_cols)

# 5-fold shuffled splitter with a fixed seed, and the row-label generator for `res`.
kf = KFold(n_splits=5, shuffle=True, random_state=40)
counter = Count().gen()

# Feature columns: everything except the z0/z1 id columns.
x_cols = [col for col in X.columns if col not in ('z0', 'z1')]
# Feature columns minus the D1/D2 coordinates (not referenced by the cells visible here).
x_cols_to_scale = [col for col in x_cols if col not in ('D1', 'D2')]

In [12]:
def iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose):
    """Fit every compared method on one train/test split and log the results.

    Calls `run_dim_reduction` for 'lmmvae', 'pca-ignore', 'vae-ignore' and
    'vae-embed' (in that order) with the notebook-level settings, prints the
    MSE returned by each call, appends one row per method to the global `res`
    frame and finally rewrites 'res_cars_random.csv' with the whole frame.

    Args:
        X_train: training frame for this split.
        X_test: test frame for this split.
        counter: iterator whose next value is used as the row label in `res`.
        d: forwarded to `run_dim_reduction` and stored in the 'd' column.
        beta: forwarded to `run_dim_reduction` and stored in the 'beta' column.
        re_prior: forwarded to `run_dim_reduction` and stored in 're_prior'.
        i: split number, stored in the 'experiment' column.
        verbose: forwarded to `run_dim_reduction`.

    Returns:
        None. The results are written to `res` and to the CSV file.
    """
    # (method name passed to run_dim_reduction, label stored in 'exp_type')
    runs = (
        ('lmmvae', 'lmmvae'),
        ('pca-ignore', 'pca-ignore'),
        ('vae-ignore', 'vae-ignore'),
        # NOTE(review): this run is executed as 'vae-embed' but logged under the
        # shorter label 'vae-em' — confirm the label is intentional.
        ('vae-embed', 'vae-em'),
    )

    outcomes = []
    for method, label in runs:
        mse, sigmas, _, n_epochs, elapsed, losses = run_dim_reduction(
            X_train, X_test, x_cols, RE_cols_prefix, d, method,
            thresh, epochs, qs, q_spatial, n_sig2bs, n_sig2bs_spatial, est_cors,
            batch_size, patience, n_neurons, n_neurons_re, dropout,
            activation, mode, beta, re_prior, kernel_root, pred_unknown_clusters,
            max_spatial_locs, time2measure_dict, verbose, U, B_list)
        print('   finished %s, mse: %.3f' % (method, mse))
        outcomes.append((method, label, mse, sigmas, n_epochs, elapsed, losses))

    # Rows are appended only after all four fits have finished, in run order.
    for method, label, mse, sigmas, n_epochs, elapsed, losses in outcomes:
        # Only the lmmvae row records an estimate (sigmas[2][0]) in
        # 'sigma_b0_spatial_est'; the other methods store NaN there.
        sigma_est = sigmas[2][0] if method == 'lmmvae' else np.nan
        res.loc[next(counter)] = [d, beta, re_prior, i, label, mse, sigma_est, n_epochs, elapsed] + losses
    res.to_csv('res_cars_random.csv')

In [13]:
# Hyper-parameter grid iterated by the experiment loops: every combination of
# beta, d and re_prior is evaluated.
betas, ds, re_priors = [0.01], [1, 2, 5], [0.01]

In [None]:
# Unknown mode: the folds are taken over cluster ids 0..q_spatial-1 rather than
# over rows, so every row of a given z0 cluster lands on the same side of the
# split. Runs only when `pred_unknown_clusters` is set.
if pred_unknown_clusters:
    for beta in betas:
        for d in ds:
            for re_prior in re_priors:
                print(f'beta: {beta}, d: {d}, re_prior: {re_prior}:')
                cluster_q = q_spatial
                cluster_folds = kf.split(range(cluster_q))
                for i, (train_clusters, test_clusters) in enumerate(cluster_folds):
                    print('  iteration %d' % i)
                    in_train = X['z0'].isin(train_clusters)
                    in_test = X['z0'].isin(test_clusters)
                    X_train = X[in_train].copy()
                    X_test = X[in_test].copy()
                    iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose=False)

In [15]:
# Random mode: rows of X are split at random into the 5 folds of `kf`, and
# every (beta, d, re_prior) combination is evaluated on each fold.
for beta in betas:
    for d in ds:
        for re_prior in re_priors:
            print(f'beta: {beta}, d: {d}, re_prior: {re_prior}:')
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                print('  iteration %d' % i)
                # KFold.split yields *positional* row indices, so select with
                # .iloc. Label-based .loc only gives the same rows while X
                # still carries a default 0..n-1 index.
                X_train = X.iloc[train_index].copy()
                X_test = X.iloc[test_index].copy()
                iterate_reg_types(X_train, X_test, counter, d, beta, re_prior, i, verbose=False)