In [None]:
import os
import sys
# Put the directory two levels up on the import path (presumably so the
# project-local `lmmnn` package imported below can be found — confirm layout).
module_path = os.path.abspath('../..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [None]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_embed, reg_nn_lmm
from lmmnn.simulation import Count

import tensorflow.keras.backend as K
import tensorflow as tf

In [None]:
# Note: data_cleaned_train_comments_X.csv is the result of an ETL process described in Kalehbasti et al. (2019), see our paper.
# We followed the script in their Github repo exactly.
# The trailing separator matters: other cells build file names as `path + name`.
path = '../../AirBnbPricePrediction/Data/'

# Each split ships as a feature table (X) and a target file (y);
# the target is flattened to a 1-D array with one entry per row.
X_train = pd.read_csv(os.path.join(path, 'data_cleaned_train_comments_X.csv'))
y_train = pd.read_csv(os.path.join(path, 'data_cleaned_train_y.csv')).values
y_train = y_train.reshape(len(y_train), )

X_val = pd.read_csv(os.path.join(path, 'data_cleaned_val_comments_X.csv'))
y_val = pd.read_csv(os.path.join(path, 'data_cleaned_val_y.csv')).values
y_val = y_val.reshape(len(y_val), )

X_test = pd.read_csv(os.path.join(path, 'data_cleaned_test_comments_X.csv'))
y_test = pd.read_csv(os.path.join(path, 'data_cleaned_test_y.csv')).values
y_test = y_test.reshape(len(y_test), )

In [None]:
# Per-column selection flags, indexed positionally against X_train's columns
# (presumably the output of an earlier feature-selection step — confirm).
# os.path.join works whether or not `path` ends with a separator.
coeffs = np.load(os.path.join(path, 'selected_coefs.npy'))
col_set = {X_train.columns[i] for i in range(len(coeffs)) if coeffs[i]}

# Columns that must be kept regardless of the selection flags.
required_cols = {'longitude', 'latitude', 'host_id'}
missing_cols = required_cols - set(X_train.columns)
if missing_cols:
    raise KeyError('required columns missing from X_train: %s' % sorted(missing_cols))

# Select in the original file order instead of iterating a set: set order for
# strings changes between interpreter runs, which would make the model's
# input column order (and hence results) non-reproducible.
keep_cols = [col for col in X_train.columns if col in col_set or col in required_cols]
X_train = X_train[keep_cols]
X_val = X_val[keep_cols]
X_test = X_test[keep_cols]

# Pool the three splits; cross-validation folds are drawn from the pooled data later.
X = pd.concat([X_train, X_val, X_test], ignore_index=True)
y = np.concatenate([y_train, y_val, y_test])

In [None]:
# Quick sanity check: number of selected columns, then the pooled X / y shapes.
for summary in (len(col_set), X.shape, y.shape):
    print(summary)
X.head()

In [None]:
# Confirm the column used as the target below was kept by the feature selection.
'Air_conditioning' in col_set

In [None]:
# Swap roles: the old target (log(price)) becomes a feature of X, and the
# binary air-conditioning column is removed from X to serve as the new y.
X['price'] = y
y = X.pop('Air_conditioning')

In [None]:
# Class balance of the new target.
y.value_counts()

In [None]:
# The host identifier is the categorical (random-effect) feature; it is named z0 from here on.
X = X.rename(columns={'host_id': 'z0'})

In [None]:
# Show every column whose name starts with 'z'.
list(filter(lambda name: name.startswith('z'), X.columns))

In [None]:
# Notebook-level settings. reg_nn() reads most of these as globals and forwards
# them positionally to the lmmnn training helpers.

# --- Network / training settings ---
n_neurons = [10, 3]
dropout = []
activation = 'relu'
batch = 100
epochs = 500
patience = 10
mode = 'glmm'
verbose = True
log_params = False
shuffle = False
idx = None

# --- Categorical (z0) settings ---
# One categorical feature, z0; its cardinality is the number of distinct values.
qs = [len(X['z0'].unique())]
n_sig2bs = 1
est_cors = []
Z_non_linear = False
Z_embed_dim_pct = 10

# --- Spatial settings (all unset / zero here) ---
q_spatial = None
n_sig2bs_spatial = 0
spatial_embed_neurons = None
dist_matrix = None

# --- Defined but not referenced elsewhere in this notebook ---
time2measure_dict = None
resolution = 100
# NOTE(review): `resultion` looks like a misspelling of `resolution`; kept so
# the set of defined names is unchanged.
resultion = None

In [None]:
def reg_nn(X_train, X_test, y_train, y_test, reg_type):
    """Fit one model variant on a train/test split and score it with ROC AUC.

    Dispatches on ``reg_type`` to one of the lmmnn helpers, forwarding the
    notebook-level settings (``qs``, ``x_cols``, ``batch``, ``epochs``, ...)
    which are read from module globals at call time.

    Parameters
    ----------
    X_train, X_test : feature tables for the two sides of the split.
    y_train, y_test : matching targets.
    reg_type : str
        One of ``'ohe'``, ``'lmm'``, ``'ignore'`` or ``'embed'``.

    Returns
    -------
    tuple
        ``(auc, sigmas, n_epochs, elapsed_seconds)`` where ``auc`` is
        ``roc_auc_score(y_test, y_pred)`` and ``elapsed_seconds`` covers only
        the call to the lmmnn helper.

    Raises
    ------
    ValueError
        If ``reg_type`` is not one of the supported values.

    Side effects: clears the Keras session, forces a garbage collection and
    shows a box plot of the predictions grouped by true label.
    """
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, _, _, n_epochs = reg_nn_lmm(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode,
            n_sig2bs, n_sig2bs_spatial, est_cors, dist_matrix, spatial_embed_neurons,
            verbose, Z_non_linear, Z_embed_dim_pct, log_params, idx, shuffle)
    elif reg_type == 'ignore':
        # Same helper as 'ohe', but with ignore_RE=True.
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, _, n_epochs = reg_nn_embed(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    else:
        # str() keeps this a ValueError even when a non-string is passed.
        raise ValueError(str(reg_type) + ' is an unknown reg_type')
    end = time.time()
    # Release the TensorFlow graph/session state so repeated fits do not accumulate memory.
    K.clear_session()
    gc.collect()
    auc = roc_auc_score(y_test, y_pred)
    # plt.scatter(y_test, y_pred, alpha = 0.5)
    sns.boxplot(x='y_test', y='y_pred', data=pd.DataFrame({'y_test': y_test, 'y_pred': y_pred}), boxprops=dict(alpha=0.5))
    plt.show()
    return auc, sigmas, n_epochs, end - start

In [None]:
# Empty results table; iterate_reg_types() appends one row per fitted model.
res = pd.DataFrame(
    columns=['experiment', 'exp_type', 'auc', 'sigma_b_est', 'n_epoch', 'time'],
)
# Placeholder only: `counter` is rebound to a generator further down in this
# cell, before iterate_reg_types() is first called.
counter = 0

def iterate_reg_types(X_train, X_test, y_train, y_test):
    """Fit the enabled model variants on one split and record their results.

    Only the 'lmm' variant is currently enabled; the 'ignore' and 'embed'
    variants are kept below as commented-out code.

    Relies on module-level state: appends a row to ``res`` at the label
    produced by ``next(counter)``, and reads the fold number from the global
    loop variable ``i``.
    """
    lmm_outcome = reg_nn(X_train, X_test, y_train, y_test, reg_type='lmm')
    auc_lmm, sigmas, n_epochs_lmm, time_lmm = lmm_outcome
    print(' finished lmmnn, auc: %.4f' % (auc_lmm))
    # auc_ig, _, n_epochs_ig, time_ig = reg_nn(X_train, X_test, y_train, y_test, reg_type='ignore')
    # print(' finished ignore, auc: %.4f' % (auc_ig))
    # auc_em, _, n_epochs_em, time_em = reg_nn(X_train, X_test, y_train, y_test, reg_type='embed')
    # print(' finished embed, auc: %.4f' % (auc_em))

    # sigmas[1][0] is what gets stored in the 'sigma_b_est' column.
    lmm_row = [i, 'lmm', auc_lmm, sigmas[1][0], n_epochs_lmm, time_lmm]
    res.loc[next(counter)] = lmm_row
    # res.loc[next(counter)] = [i, 'ignore', auc_ig, np.nan, n_epochs_ig, time_ig]
    # res.loc[next(counter)] = [i, 'embed', auc_em, np.nan,  n_epochs_em, time_em]

# 5-fold cross-validation over the pooled data, with a fixed seed for the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Source of row labels for `res`; consumed via next(counter) in iterate_reg_types().
counter = Count().gen()

# Every column except the categorical z0 is passed to the models as x_cols.
x_cols = [col for col in X.columns if col not in ['z0']]
x_cols_to_scale = ['price']

# NOTE: the loop variable must stay named `i` — iterate_reg_types() reads it as a global.
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    # KFold.split yields positional indices, so select by position (.iloc)
    # rather than by label; this stays correct even if the index is not 0..n-1.
    X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # Fit the scaler on the training fold only, then apply it to the test fold.
    scaler = StandardScaler()
    X_train[x_cols_to_scale] = scaler.fit_transform(X_train[x_cols_to_scale])
    X_test[x_cols_to_scale] = scaler.transform(X_test[x_cols_to_scale])
    iterate_reg_types(X_train, X_test, y_train, y_test)

In [None]:
# Per-fold results collected by iterate_reg_types().
res

In [None]:
# Mean AUC over the folds, per model variant.
res.groupby('exp_type')['auc'].mean()

In [None]:
# Persist the results table. Create the output directory first so the write
# does not fail on a fresh checkout where it does not exist yet.
results_path = '../../results/res_airbnb_glmm.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
res.to_csv(results_path)