In [1]:
import os
import sys
# Add the parent of the current working directory to the import path (once),
# presumably so that the local `lmmnn` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold

from lmmnn.layers import NLL
from lmmnn.nn import calc_b_hat
from lmmnn.menet import menet_fit, menet_predict
from lmmnn.simulation import Count

from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Embedding, Reshape, Concatenate
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

import tensorflow as tf

In [3]:
# Switch the Keras default float type to double precision for everything built below.
tf.keras.backend.set_floatx('float64')
# Echo the active float type as the cell output to confirm the setting was applied.
tf.keras.backend.floatx()

'float64'

In [4]:
# Note: data_cleaned_train_comments_X.csv is the result of an ETL process described in
# Kalehbasti et al. (2019), see our paper. We followed the script in their GitHub repo exactly.

# Due to a TF bug there seems to be a memory leak between LMMNN and other methods when they
# are run together, and those other methods suffer in performance.
# Consider running LMMNN separately.

path = 'AirBnbPricePrediction/Data/'


def _read_split(features_file, target_file):
    """Read one data split from `path`.

    Returns the features as a DataFrame and the target as a flat 1-D array
    with one entry per row of the target CSV.
    """
    features = pd.read_csv(path + features_file)
    target = pd.read_csv(path + target_file).values
    return features, target.reshape(len(target), )


X_train, y_train = _read_split('data_cleaned_train_comments_X.csv', 'data_cleaned_train_y.csv')
X_val, y_val = _read_split('data_cleaned_val_comments_X.csv', 'data_cleaned_val_y.csv')
X_test, y_test = _read_split('data_cleaned_test_comments_X.csv', 'data_cleaned_test_y.csv')

In [5]:
# `coeffs` holds one flag per column of X_train; a truthy flag means the column is kept.
coeffs = np.load(path + 'selected_coefs.npy')
col_set = set()
for i, is_selected in enumerate(coeffs):
    if is_selected:
        col_set.add(X_train.columns[i])

# Keep the selected features plus the host identifier in every split.
keep_cols = list(col_set | {'host_id'})
X_train, X_val, X_test = (split[keep_cols] for split in (X_train, X_val, X_test))

# Stack the three splits into a single data set; the cross-validation below re-splits it.
X = pd.concat((X_train, X_val, X_test), ignore_index=True)
y = np.concatenate((y_train, y_val, y_test))

In [6]:
# Sanity check: features and target should have the same number of rows.
print(X.shape, y.shape, sep='\n')

(49976, 197)
(49976,)


In [7]:
# relevant Adam params taken from original paper
# NOTE(review): NUM_ITERATIONS and BATCH_SIZE are not referenced by any cell shown in this
# notebook; reg_nn() trains with its own defaults (batch=30, epochs=100) - confirm which is intended.
NUM_ITERATIONS = 1000
BATCH_SIZE = 256
# LEARNING_RATE and DECAY_RATE are passed to Adam by every reg_nn_* model builder below.
LEARNING_RATE = 0.001
DECAY_RATE = 0.0001

# original paper Adam configuration:
# adam = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)

# some other params
# `mode` is forwarded to the NLL layer and to calc_b_hat in reg_nn_lmm.
mode = 'intercepts'
# NOTE(review): n_sig2bs, n_neurons, dropout and activation are not read by the cells shown
# here; the layer sizes (20, 5) and the 'relu' activation are hard-coded in the model builders.
n_sig2bs = 1
# `est_cors` is forwarded to calc_b_hat in reg_nn_lmm.
est_cors = []
n_neurons = [20, 5]
dropout = None
activation = 'relu'

In [8]:
# Upper bound on the host ids: largest `host_id` across the three splits, plus one.
# Used below as the embedding vocabulary size and as the number of clusters.
n_cats = max(split['host_id'].max() for split in (X_train, X_val, X_test)) + 1

In [9]:
# Number of feature columns selected via the coefficient mask.
len(col_set)

196

In [10]:
# Inspect the names of the selected feature columns.
col_set

{'24-hour_check-in',
 'Accessible-height_bed',
 'Accessible-height_toilet',
 'Air_conditioning',
 'Apartment',
 'Arverne.1',
 'Astoria.1',
 'Baby_bath',
 'Babysitter_recommendations',
 'Bathtub',
 'Bay Ridge',
 'Bed_linens',
 'Bensonhurst',
 'Boerum Hill',
 'Borough Park',
 'Boutique hotel',
 'Breakfast',
 'Bronx',
 'Bronx.1',
 'Brooklyn ',
 'Brooklyn Heights',
 'Brooklyn.1',
 'Bushwick.1',
 'Buzzer/wireless_intercom',
 'Cable_TV',
 'Carbon_monoxide_detector',
 'Carroll Gardens',
 'Cat(s)',
 'Chelsea',
 'Children’s_books_and_toys',
 'Clinton Hill',
 'Cobble Hill',
 'Coffee_maker',
 'Cooking_basics',
 'Crib',
 'Cypress Hills',
 'DUMBO',
 'Dishes_and_silverware',
 'Dishwasher',
 'Doorman',
 'Downtown Brooklyn',
 'Dryer',
 'East Flatbush',
 'East Harlem',
 'East Village',
 'Elevator',
 'Elmhurst.1',
 'Entire home/apt',
 'Essentials',
 'Ethernet_connection',
 'Extra_pillows_and_blankets',
 'Family/kid_friendly',
 'Financial District',
 'Fire_extinguisher',
 'First_aid_kit',
 'Flat_path_to_

In [11]:
# The model builders below read the cluster (host) id from a column called `z0`.
X = X.rename(columns={'host_id': 'z0'})
# Use a float64 target, the same precision as the Keras float type configured earlier.
y = y.astype('float64')

In [12]:
def reg_nn_lmm(X_train, X_test, y_train, y_test, batch_size, epochs, patience, verbose):
    """Fit the LMMNN model (network + NLL loss layer keyed by `z0`) and predict a test fold.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the feature columns in the notebook-level `col_set` and the
        integer cluster column `z0`. Neither frame is modified.
    y_train : array-like
        Training targets, aligned row by row (by position) with `X_train`.
    y_test : array-like
        Only its shape is used, to build the placeholder target input at predict time.
    batch_size, epochs, verbose
        Forwarded to `model.fit`.
    patience : int or None
        Early-stopping patience on `val_loss`; `None` means `epochs`.

    Returns
    -------
    tuple
        `(y_pred, (sig2e_est, sig2b_ests), n_epochs_run)` where `y_pred` is the network
        output for the test rows plus the estimated per-cluster effect `b_hat[z0]`.
    """
    # Recent pandas rejects a set as a column indexer; freeze one ordered list so the
    # training and test features are guaranteed to share the same column order.
    feature_cols = sorted(col_set)

    X_input = Input(shape=(len(feature_cols),))
    y_true_input = Input(shape=(1,))
    Z_input = Input(shape=(1,), dtype=tf.int64)
    hidden1 = Dense(units=20, activation='relu')(X_input)
    hidden2 = Dense(units=5, activation='relu')(hidden1)
    y_pred_output = Dense(1, activation='linear')(hidden2)
    # The loss is computed inside the NLL layer, hence the target is a model input.
    nll = NLL(mode, 1.0, [1.0])(y_true_input, y_pred_output, [Z_input])
    model = Model(inputs=[X_input, y_true_input, Z_input], outputs=nll)

    adam = Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(optimizer=adam)

    callbacks = [EarlyStopping(monitor='val_loss', patience=epochs if patience is None else patience)]

    # Order the training rows by cluster id. This is done positionally and on new objects:
    # sorting the caller's frame in place would leave it misaligned with the caller's
    # y_train for every method run afterwards, and a label lookup of the target after an
    # index reset fails whenever the fold's labels are not exactly 0..n-1.
    order = np.argsort(X_train['z0'].values, kind='stable')
    X_train = X_train.iloc[order]
    # Keep the target as a Series sharing X_train's index so that both positional and
    # index-aligned access downstream see matching rows.
    y_train = pd.Series(np.asarray(y_train)[order], index=X_train.index)

    history = model.fit([X_train[feature_cols], y_train, X_train['z0']], None, batch_size=batch_size, epochs=epochs,
                        validation_split=0.1, callbacks=callbacks, verbose=verbose, shuffle=True)

    sig2e_est, sig2b_ests, _ = model.layers[-1].get_vars()
    y_pred_tr = model.predict([X_train[feature_cols], y_train, X_train['z0']]).reshape(X_train.shape[0])
    b_hat = calc_b_hat(X_train, y_train, y_pred_tr, [n_cats], sig2e_est, sig2b_ests,
                       False, model, None, mode, None, est_cors, None)
    # The true test targets are not fed to the model: random noise fills the target input.
    dummy_y_test = np.random.normal(size=y_test.shape)
    y_pred = model.predict([X_test[feature_cols], dummy_y_test, X_test['z0']]).reshape(X_test.shape[0])
    y_pred = y_pred + b_hat[X_test['z0'].values]
    return y_pred, (sig2e_est, sig2b_ests), len(history.history['loss'])

In [13]:
def reg_nn_ignore(X_train, X_test, y_train, y_test, batch_size, epochs, patience, verbose):
    """Baseline network that ignores the cluster column and uses only the features in `col_set`.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the feature columns in the notebook-level `col_set`.
    y_train : array-like
        Training targets, aligned row by row with `X_train`.
    y_test : array-like
        Unused; kept so all `reg_nn_*` functions share one signature.
    batch_size, epochs, verbose
        Forwarded to `model.fit`.
    patience : int or None
        Early-stopping patience on `val_loss`; `None` means `epochs`.

    Returns
    -------
    tuple
        `(y_pred, (None, None), n_epochs_run)`.
    """
    # Recent pandas rejects a set as a column indexer; use one ordered list for both
    # training and test so the column order is guaranteed to match.
    feature_cols = sorted(col_set)

    model = Sequential()
    model.add(Dense(units=20, activation='relu', input_dim=len(feature_cols)))
    model.add(Dense(units=5, activation='relu'))
    model.add(Dense(units=1, activation='linear'))
    # `learning_rate` replaces the deprecated `lr` alias (and matches reg_nn_lmm).
    adam = Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(loss='mse', optimizer=adam)
    callbacks = [EarlyStopping(monitor='val_loss', patience=epochs if patience is None else patience)]
    history = model.fit(X_train[feature_cols], y_train, epochs=epochs, verbose=verbose,
                        batch_size=batch_size, validation_split=0.1, callbacks=callbacks)
    y_pred = model.predict(X_test[feature_cols]).reshape(X_test.shape[0])
    return y_pred, (None, None), len(history.history['loss'])

In [14]:
def reg_nn_embed(X_train, X_test, y_train, y_test, batch_size, epochs, patience, verbose):
    """Baseline network that feeds the cluster id `z0` through an embedding layer.

    The embedding of `z0` (vocabulary size `n_cats`, 100 dimensions) is concatenated
    with the features in `col_set` before the dense layers.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the feature columns in the notebook-level `col_set` and `z0`.
    y_train : array-like
        Training targets, aligned row by row with `X_train`.
    y_test : array-like
        Unused; kept so all `reg_nn_*` functions share one signature.
    batch_size, epochs, verbose
        Forwarded to `model.fit`.
    patience : int or None
        Early-stopping patience on `val_loss`; `None` means `epochs`.

    Returns
    -------
    tuple
        `(y_pred, (None, None), n_epochs_run)`.
    """
    embed_dim = 100
    # Recent pandas rejects a set as a column indexer; use one ordered list for both
    # training and test so the column order is guaranteed to match.
    feature_cols = sorted(col_set)

    X_input = Input(shape=(len(feature_cols),))
    Z_input = Input(shape=(1,))
    embed = Embedding(n_cats, embed_dim, input_length=1)(Z_input)
    embed = Reshape(target_shape=(embed_dim,))(embed)
    concat = Concatenate()([X_input, embed])

    # No `input_dim` here: in the functional API the input width is taken from `concat`
    # (features + embedding), so the previously stated feature-only width had no effect.
    hidden1 = Dense(units=20, activation='relu')(concat)
    hidden2 = Dense(units=5, activation='relu')(hidden1)
    output = Dense(1, activation='linear')(hidden2)

    model = Model(inputs=[X_input, Z_input], outputs=output)
    # `learning_rate` replaces the deprecated `lr` alias (and matches reg_nn_lmm).
    adam = Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(loss='mse', optimizer=adam)

    callbacks = [EarlyStopping(monitor='val_loss', patience=epochs if patience is None else patience)]
    history = model.fit([X_train[feature_cols], X_train['z0']], y_train, batch_size=batch_size, epochs=epochs,
                        validation_split=0.1, callbacks=callbacks, verbose=verbose)
    y_pred = model.predict([X_test[feature_cols], X_test['z0']]).reshape(X_test.shape[0])
    return y_pred, (None, None), len(history.history['loss'])

In [15]:
def reg_nn_menet(X_train, X_test, y_train, y_test, batch_size, epochs, patience, verbose):
    """Fit the MeNet baseline via `menet_fit` and predict the test fold via `menet_predict`.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the feature columns in the notebook-level `col_set` and the
        cluster column `z0`.
    y_train : array-like
        Training targets, passed to `menet_fit` unchanged.
    y_test : array-like
        Unused; kept so all `reg_nn_*` functions share one signature.
    batch_size, epochs, patience, verbose
        Forwarded to `menet_fit`.

    Returns
    -------
    tuple
        `(y_pred, (sig2e_est, None), n_epochs)` as produced by `menet_fit` / `menet_predict`.
    """
    q = n_cats
    # Recent pandas rejects a set as a column indexer; use one ordered list for both
    # training and test so the column order is guaranteed to match.
    feature_cols = sorted(col_set)
    clusters_train, clusters_test = X_train['z0'].values, X_test['z0'].values
    # Plain arrays are handed to menet; distinct names avoid shadowing the DataFrame arguments.
    features_train = X_train[feature_cols].values
    features_test = X_test[feature_cols].values

    model = Sequential()
    model.add(Dense(units=20, activation='relu', input_dim=features_train.shape[1]))
    model.add(Dense(units=5, activation='relu'))
    model.add(Dense(units=1, activation='linear'))
    # `learning_rate` replaces the deprecated `lr` alias (and matches reg_nn_lmm).
    adam = Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(loss='mse', optimizer=adam)

    model, b_hat, sig2e_est, n_epochs, _ = menet_fit(model, features_train, y_train, clusters_train, q,
                                                     batch_size, epochs, patience, verbose=verbose)
    y_pred = menet_predict(model, features_test, clusters_test, q, b_hat)
    return y_pred, (sig2e_est, None), n_epochs

In [16]:
def reg_nn(X_train, X_test, y_train, y_test, batch=30, epochs=100, patience=10, reg_type='ohe', verbose=False):
    """Train one model variant on a fold and report its test error and run time.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Fold data, forwarded unchanged to the selected `reg_nn_<reg_type>` function.
    batch, epochs, patience, verbose
        Training settings, forwarded to the selected function.
    reg_type : {'ohe', 'lmm', 'ignore', 'embed', 'menet'}
        Which model variant to run.
        NOTE(review): `reg_nn_ohe` (the default) is not defined in the cells shown in
        this notebook - confirm it exists before relying on the default.

    Returns
    -------
    tuple
        `(mse, sigmas, n_epochs, seconds)`: mean squared error on the test fold, the
        variance estimates returned by the variant, the number of epochs it ran, and
        the wall-clock time of fit + predict.

    Raises
    ------
    ValueError
        If `reg_type` is not one of the known variants.
    """
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, n_epochs = reg_nn_ohe(X_train, X_test, y_train, y_test, batch, epochs, patience, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, n_epochs = reg_nn_lmm(X_train, X_test, y_train, y_test, batch, epochs, patience, verbose)
    elif reg_type == 'ignore':
        y_pred, sigmas, n_epochs = reg_nn_ignore(X_train, X_test, y_train, y_test, batch, epochs, patience, verbose)
    elif reg_type == 'embed':
        y_pred, sigmas, n_epochs = reg_nn_embed(X_train, X_test, y_train, y_test, batch, epochs, patience, verbose)
    elif reg_type == 'menet':
        y_pred, sigmas, n_epochs = reg_nn_menet(X_train, X_test, y_train, y_test, batch, epochs, patience, verbose)
    else:
        # The separating space was missing, which glued the type name to the message.
        raise ValueError('%s is an unknown reg_type' % reg_type)
    end = time.time()
    mse = np.mean((y_pred - y_test)**2)
    return mse, sigmas, n_epochs, end - start

In [17]:
# Results table: one row per (fold, method), filled in by iterate_reg_types.
res = pd.DataFrame(
    columns=['experiment', 'exp_type', 'mse', 'sigma_e_est', 'sigma_b0_est', 'n_epoch', 'time'],
)
# Generator whose successive values are used as row labels of `res`.
counter = Count().gen()
# 5-fold cross-validation splitter over the stacked data set.
kf = KFold(n_splits=5)
# All columns of X except the cluster column.
x_cols = X.columns[X.columns != 'z0'].tolist()

In [18]:
def iterate_reg_types(X_train, X_test, y_train, y_test, counter, verbose, experiment=None):
    """Run every model variant on one fold and append one row per variant to `res`.

    Variants run, in order: 'lmm', 'ignore', 'embed', 'menet' (the 'ohe' variant is
    disabled). Rows are only written after all variants have finished.

    Parameters
    ----------
    X_train, X_test, y_train, y_test
        Fold data. Each variant receives its own copies, so nothing a variant does to
        its inputs can affect the variants that run after it.
    counter : iterator
        Yields the row label to use for each new row of the notebook-level `res` frame.
    verbose
        Forwarded to `reg_nn`.
    experiment : optional
        Identifier stored in the `experiment` column. When omitted, the notebook-level
        loop variable `i` is used (the previous, implicit behaviour).
    """
    if experiment is None:
        # Backward-compatible fallback to the global fold counter of the CV loop.
        experiment = i

    def run(reg_type):
        # Fresh copies per variant: the fitting helpers may reorder or mutate their inputs.
        return reg_nn(X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy(),
                      reg_type=reg_type, verbose=verbose)

    mse_lmm, sigmas, n_epochs_lmm, time_lmm = run('lmm')
    print(' finished lmm, mse: %.2f' % mse_lmm)
    mse_ig, _, n_epochs_ig, time_ig = run('ignore')
    print(' finished ignore, mse: %.2f' % mse_ig)
    mse_em, _, n_epochs_em, time_em = run('embed')
    print(' finished embed, mse: %.2f' % mse_em)
    mse_me, sigmas_me, n_epochs_me, time_me = run('menet')
    print(' finished menet, mse: %.2f' % mse_me)

    res.loc[next(counter)] = [experiment, 'lmm', mse_lmm, sigmas[0], sigmas[1][0], n_epochs_lmm, time_lmm]
    res.loc[next(counter)] = [experiment, 'ignore', mse_ig, np.nan, np.nan, n_epochs_ig, time_ig]
    res.loc[next(counter)] = [experiment, 'embed', mse_em, np.nan, np.nan, n_epochs_em, time_em]
    res.loc[next(counter)] = [experiment, 'menet', mse_me, sigmas_me[0], np.nan, n_epochs_me, time_me]

In [20]:
# 5-fold cross-validation: every fold runs all model variants and appends its rows to `res`.
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    # KFold yields positional indices, so rows are selected by position (`iloc`);
    # label-based `loc` is only equivalent while X happens to carry a 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # Targets are used on their original scale and share the index of their feature rows.
    y_train = pd.Series(y[train_index], index=X_train.index)
    y_test = pd.Series(y[test_index], index=X_test.index)
    iterate_reg_types(X_train, X_test, y_train, y_test, counter, verbose=True)

In [22]:
# Display the results table (one row per fold and method).
res

In [None]:
# Create the output folder if needed: to_csv fails on a missing directory, which would
# lose the results of the whole cross-validation run.
os.makedirs('../results', exist_ok=True)
res.to_csv('../results/res_airbnb.csv')