In [22]:
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the local `lmmnn` package imported below resolves - confirm).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

from lmmnn.layers import NLL
from lmmnn.callbacks import EarlyStoppingWithSigmasConvergence

from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Layer, Input, Dropout, Embedding, Reshape, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

import tensorflow as tf

In [24]:
# Note: data_cleaned_train_comments_X.csv is the result of an ETL process described in Kalehbasti et al. (2019), see our paper.
# We followed the script in their Github repo exactly.
# The data directory can be overridden through the AIRBNB_DATA_DIR environment variable;
# the default is the original hard-coded location. `path` always ends with a separator
# because file names are appended to it with plain string concatenation (here and when
# loading 'selected_coefs.npy' later on).
path = os.path.join(os.environ.get('AIRBNB_DATA_DIR', 'C:/Users/gsimchoni/AirBnbPricePrediction/Data/'), '')


def _load_split(split):
    """Read one split ('train', 'val' or 'test').

    Returns the feature DataFrame and the target values reshaped to a 1-D array
    with one entry per row of the target file.
    """
    features = pd.read_csv(path + 'data_cleaned_%s_comments_X.csv' % split)
    target = pd.read_csv(path + 'data_cleaned_%s_y.csv' % split).values
    return features, target.reshape(len(target), )


X_train, y_train = _load_split('train')
X_val, y_val = _load_split('val')
X_test, y_test = _load_split('test')

In [25]:
# Boolean-like selection array saved next to the data: entry i says whether
# column i of X_train is kept as a feature.
coeffs = np.load(path + 'selected_coefs.npy')
col_set = set()

for i, is_selected in enumerate(coeffs):
    if is_selected:
        col_set.add(X_train.columns[i])

# Selected features plus the grouping column 'host_id'. The list comes from a
# set, so its order is arbitrary; the same list is applied to all three splits.
kept_cols = list(col_set | {'host_id'})
X_train = X_train[kept_cols]
X_val = X_val[kept_cols]
X_test = X_test[kept_cols]

# Stack the three splits into one data set with a fresh 0..n-1 index.
X = pd.concat([X_train, X_val, X_test], ignore_index=True)
y = np.concatenate([y_train, y_val, y_test])

In [26]:
# Sanity check: the stacked features and targets must have the same number of rows.
print(X.shape, y.shape, sep='\n')

(49976, 197)
(49976,)


In [27]:
# relevant Adam params taken from original paper
# NUM_ITERATIONS and BATCH_SIZE are read by reg_nn_ignore only (as `epochs` and
# `batch_size` of its fit call); LEARNING_RATE and DECAY_RATE are passed to the
# Adam optimizer of all three models (`lr` and `decay`).
NUM_ITERATIONS = 1000
BATCH_SIZE = 256
LEARNING_RATE = 0.001
DECAY_RATE = 0.0001

In [28]:
# One more than the largest host_id found in any split. Used as the Embedding
# vocabulary size in reg_nn_embed and as the category count in calc_b_hat.
n_cats = max(split['host_id'].max() for split in (X_train, X_val, X_test)) + 1

In [29]:
# Number of selected feature columns (excludes 'host_id' unless it was itself selected).
len(col_set)

196

In [30]:
# Display the names of the selected feature columns.
col_set

{'24-hour_check-in',
 'Accessible-height_bed',
 'Accessible-height_toilet',
 'Air_conditioning',
 'Apartment',
 'Arverne.1',
 'Astoria.1',
 'Baby_bath',
 'Babysitter_recommendations',
 'Bathtub',
 'Bay Ridge',
 'Bed_linens',
 'Bensonhurst',
 'Boerum Hill',
 'Borough Park',
 'Boutique hotel',
 'Breakfast',
 'Bronx',
 'Bronx.1',
 'Brooklyn ',
 'Brooklyn Heights',
 'Brooklyn.1',
 'Bushwick.1',
 'Buzzer/wireless_intercom',
 'Cable_TV',
 'Carbon_monoxide_detector',
 'Carroll Gardens',
 'Cat(s)',
 'Chelsea',
 'Children’s_books_and_toys',
 'Clinton Hill',
 'Cobble Hill',
 'Coffee_maker',
 'Cooking_basics',
 'Crib',
 'Cypress Hills',
 'DUMBO',
 'Dishes_and_silverware',
 'Dishwasher',
 'Doorman',
 'Downtown Brooklyn',
 'Dryer',
 'East Flatbush',
 'East Harlem',
 'East Village',
 'Elevator',
 'Elmhurst.1',
 'Entire home/apt',
 'Essentials',
 'Ethernet_connection',
 'Extra_pillows_and_blankets',
 'Family/kid_friendly',
 'Financial District',
 'Fire_extinguisher',
 'First_aid_kit',
 'Flat_path_to_

In [31]:
def calc_b_hat(X_train, y_train, y_pred_tr, n_cats, sig2e, sig2b, Z_name):
    """Predict the random intercept of every category from training residuals.

    For a category i with n_i training rows the value is

        b_i = n_i * sig2b * (y_bar_i - y_pred_bar_i) / (sig2e + n_i * sig2b)

    where y_bar_i and y_pred_bar_i are the means of the observed and fitted
    values of that category. Categories with no training rows get 0.

    Parameters
    ----------
    X_train : DataFrame holding the category codes in column `Z_name`.
    y_train : observed targets, one per row of `X_train`.
    y_pred_tr : fitted values, one per row of `X_train`.
    n_cats : number of categories; the result has one entry per code 0..n_cats-1.
    sig2e, sig2b : variance estimates used in the shrinkage factor.
    Z_name : name of the category column.

    Returns
    -------
    numpy array of length `n_cats`.

    The per-category sums are accumulated in one pass with ``np.bincount``
    instead of re-scanning the whole column once per category. The mean
    residual is computed as sum(y - y_pred) / n_i, which equals
    y_bar_i - y_pred_bar_i up to floating-point rounding.
    """
    if n_cats <= 0:
        return np.array([])
    codes = np.asarray(X_train[Z_name])
    residuals = (np.asarray(y_train, dtype=float).ravel()
                 - np.asarray(y_pred_tr, dtype=float).ravel())
    # Only whole-number codes inside 0..n_cats-1 can match a category;
    # anything else is ignored.
    usable = (codes >= 0) & (codes < n_cats) & (codes % 1 == 0)
    idx = codes[usable].astype(np.intp)
    n = np.bincount(idx, minlength=n_cats)
    resid_sum = np.bincount(idx, weights=residuals[usable], minlength=n_cats)
    b_hat = np.zeros(n_cats)
    seen = n > 0
    mean_resid = resid_sum[seen] / n[seen]
    # BP(b_i) = (n_i * sig2b / (sig2e + n_i * sig2b)) * (y_bar_i - y_pred_bar_i)
    b_hat[seen] = n[seen] * sig2b * mean_resid / (sig2e + n[seen] * sig2b)
    return b_hat

def process_one_hot_encoding(X_train, X_test, RE_col):
    """One-hot encode `RE_col` in both frames and align the test columns to train.

    Each result is the `x_cols` feature columns followed by the dummy columns
    of `RE_col`. The test frame is reshaped to exactly the training columns, in
    training order: levels seen only in the test data are dropped and levels
    seen only in the training data are added as all-zero uint8 columns.

    NOTE(review): `x_cols` is read from module scope and is not defined in the
    visible part of the notebook - confirm where it is supposed to come from.

    Returns
    -------
    (X_train_ohe, X_test_ohe_comp)
    """
    X_train_ohe = pd.concat([X_train[x_cols], pd.get_dummies(X_train[RE_col])], axis=1)
    X_test_ohe = pd.concat([X_test[x_cols], pd.get_dummies(X_test[RE_col])], axis=1)
    train_cols = set(X_train_ohe.columns)
    test_cols = set(X_test_ohe.columns)
    # Ordered lists rather than sets: recent pandas rejects a set both as a
    # column indexer and as the `columns=` argument of DataFrame.
    X_test_cols_in_train = [c for c in X_test_ohe.columns if c in train_cols]
    X_train_cols_not_in_test = [c for c in X_train_ohe.columns if c not in test_cols]
    X_test_comp = pd.DataFrame(np.zeros((X_test.shape[0], len(X_train_cols_not_in_test))),
                               columns=X_train_cols_not_in_test, dtype=np.uint8, index=X_test.index)
    X_test_ohe_comp = pd.concat([X_test_ohe[X_test_cols_in_train], X_test_comp], axis=1)
    X_test_ohe_comp = X_test_ohe_comp[X_train_ohe.columns]
    return X_train_ohe, X_test_ohe_comp

def reg_nn(X_train, X_test, y_train, y_test, batch=30, epochs=500, patience=10, reg_type='lm'):
    """Fit one model variant and return its test MSE and variance estimates.

    `reg_type` selects the variant: 'lmm' -> reg_nn_lmm, 'ignore' ->
    reg_nn_ignore, anything else (including the default 'lm', which has no
    dedicated implementation) -> reg_nn_embed.

    Returns
    -------
    (mse, sigmas): mean squared error of the predictions against `y_test`, and
    the second value returned by the selected fitting function.
    """
    if reg_type == 'lmm':
        fit_predict = reg_nn_lmm
    elif reg_type == 'ignore':
        fit_predict = reg_nn_ignore
    else:
        fit_predict = reg_nn_embed
    y_pred, sigmas = fit_predict(X_train, X_test, y_train, y_test, batch, epochs, patience)
    squared_errors = (y_pred - y_test) ** 2
    return np.mean(squared_errors), sigmas

def reg_nn_ignore(X_train, X_test, y_train, y_test, batch_size, epochs, patience):
    """Fit a 20-5-1 dense network on the `col_set` features only (no 'host_id').

    NOTE(review): `batch_size`, `epochs` and `patience` are accepted but not
    used; training runs with the module-level BATCH_SIZE and NUM_ITERATIONS and
    an early-stopping patience of 10, unlike reg_nn_embed and reg_nn_lmm which
    use the passed-in batch size and epochs. Confirm this is intended.

    Returns
    -------
    (y_pred, (None, None)): predictions for `X_test` as a 1-D array; this model
    has no variance components to report.
    """
    # Index with a list: a set is not accepted as a column indexer by recent
    # pandas. The list is built once so train and test use the same column order.
    feature_cols = list(col_set)
    X_tr = X_train[feature_cols]

    model = Sequential()

    model.add(Dense(units=20, activation='relu', input_dim=X_tr.shape[1]))
    model.add(Dense(units=5, activation='relu'))
    model.add(Dense(units=1, activation='linear'))
    adam = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(loss='mse', optimizer=adam)
    callbacks = [EarlyStopping(monitor='val_loss', patience=10)]
    model.fit(X_tr, y_train, epochs=NUM_ITERATIONS, verbose=0,
              batch_size=BATCH_SIZE, validation_split = 0.1, callbacks=callbacks)
    y_pred = model.predict(X_test[feature_cols]).reshape(X_test.shape[0])
    return y_pred, (None, None)

def reg_nn_embed(X_train, X_test, y_train, y_test, batch_size, epochs, patience):
    """Fit a 20-5-1 dense network on the features plus a 'host_id' embedding.

    'host_id' is mapped through an Embedding of `n_cats` rows and 100 columns,
    and the embedding vector is concatenated to the `col_set` features before
    the dense layers. Early stopping monitors 'val_loss' on a 10% validation
    split; a `patience` of None means a patience of `epochs`.

    Returns
    -------
    (y_pred, (None, None)): predictions for `X_test` as a 1-D array; this model
    has no variance components to report.
    """
    embed_dim = 100
    # Index with a list: a set is not accepted as a column indexer by recent
    # pandas. The list is built once so train and test use the same column order.
    feature_cols = list(col_set)
    X_tr = X_train[feature_cols]

    X_input = Input(shape=(X_tr.shape[1],))
    Z_input = Input(shape=(1,))
    embed = Embedding(n_cats, embed_dim, input_length = 1)(Z_input)
    embed = Reshape(target_shape = (embed_dim,))(embed)
    concat = Concatenate()([X_input, embed])

    # No `input_dim` here: the layer is applied to `concat`, whose width
    # (features + embed_dim) determines the input size.
    hidden1 = Dense(units=20, activation='relu')(concat)
    hidden2 = Dense(units=5, activation='relu')(hidden1)
    output = Dense(1, activation='linear')(hidden2)

    model = Model(inputs=[X_input, Z_input], outputs=output)
    adam = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(loss='mse', optimizer=adam)

    callbacks = [EarlyStopping(monitor='val_loss', patience=epochs if patience is None else patience)]
    model.fit([X_tr, X_train['host_id']], y_train, batch_size=batch_size, epochs=epochs,
              validation_split=0.1, callbacks=callbacks, verbose=0)
    y_pred = model.predict([X_test[feature_cols], X_test['host_id']]).reshape(X_test.shape[0])
    return y_pred, (None, None)

def reg_nn_lmm(X_train, X_test, y_train, y_test, batch_size, epochs, patience):
    """Fit the 20-5-1 network through the NLL layer and add per-host intercepts.

    The network sees only the `col_set` features. The targets and 'host_id' are
    fed as extra model inputs to the `NLL` layer, which is the model output;
    the model is compiled without an explicit loss. After training, the two
    values returned by the NLL layer's `get_vars()` are used as
    (sig2e, sig2b) in `calc_b_hat`, and the resulting per-host value is added
    to the test predictions.

    NOTE(review): `patience` is accepted but the callback is created with a
    fixed patience of 10 - confirm.

    Returns
    -------
    (y_pred, (sig2e_est, sig2b_est))
    """
    # Index with a list: a set is not accepted as a column indexer by recent
    # pandas. The list is built once so train and test use the same column order.
    feature_cols = list(col_set)
    X_tr = X_train[feature_cols]
    n_features = X_tr.shape[1]

    X_input = Input(shape=(n_features,))
    y_true_input = Input(shape=(1,))
    Z_input = Input(shape=(1,), dtype=tf.int64)
    hidden1 = Dense(units=20, activation='relu', input_dim=n_features)(X_input)
    hidden2 = Dense(units=5, activation='relu')(hidden1)
    y_pred_output = Dense(1, activation='linear')(hidden2)
    nll = NLL(1.0, 1.0)(y_true_input, y_pred_output, Z_input)
    model = Model(inputs=[X_input, y_true_input, Z_input], outputs=nll)

    adam = Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=None, decay=DECAY_RATE, amsgrad=False)
    model.compile(optimizer= adam)

    callbacks = [EarlyStoppingWithSigmasConvergence(patience = 10)]
    model.fit([X_tr, y_train, X_train['host_id']], None, batch_size=batch_size, epochs=epochs,
              validation_split=0.1, callbacks=callbacks, verbose=0)

    sig2e_est, sig2b_est = model.layers[-1].get_vars()
    y_pred_tr = model.predict([X_tr, y_train, X_train['host_id']]).reshape(X_train.shape[0])
    b_hat = calc_b_hat(X_train, y_train, y_pred_tr, n_cats, sig2e_est, sig2b_est, 'host_id')
    # The model needs a value for its target input at prediction time as well;
    # random noise is passed (presumably a placeholder that does not influence
    # the predicted values - confirm against the NLL layer).
    y_placeholder = np.random.normal(size=y_test.shape)
    y_pred = model.predict([X_test[feature_cols], y_placeholder, X_test['host_id']]).reshape(X_test.shape[0]) + b_hat[X_test['host_id']]
    return y_pred, (sig2e_est, sig2b_est)

In [32]:
# Results table: three rows (lmm / ignore / embed) are appended per CV fold.
res = pd.DataFrame(columns=['experiment', 'exp_type', 'deep', 'mse', 'sigma_e_est', 'sigma_b_est'])
# Next free row label in `res`; advanced by iterate_reg_types.
counter = 0

def iterate_reg_types(X_train, X_test, y_train, y_test, deep, experiment=None):
    """Fit the lmm, ignore and embed variants on one split and record their MSE.

    Appends three rows to the module-level `res` table and advances the
    module-level `counter` by 3. `deep` is only printed and stored in the
    table; it is not forwarded to the fitting functions.

    `experiment` is the value stored in the 'experiment' column. When omitted,
    the module-level loop variable `i` is used, which is what this function
    always did before the parameter existed.
    """
    global counter
    if experiment is None:
        # Backward-compatible fallback for callers that do not pass the fold index.
        experiment = i
    mse_lmm, sigmas = reg_nn(X_train, X_test, y_train, y_test, reg_type='lmm')
    print(' finished lmm deep=%s, mse: %.2f' % (deep, mse_lmm))
    mse_ig, _ = reg_nn(X_train, X_test, y_train, y_test, reg_type='ignore')
    print(' finished ignore deep=%s, mse: %.2f' % (deep, mse_ig))
    mse_em, _ = reg_nn(X_train, X_test, y_train, y_test, reg_type='embed')
    print(' finished embed deep=%s, mse: %.2f' % (deep, mse_em))
    res.loc[counter + 0] = [experiment, 'lmm', deep, mse_lmm, sigmas[0], sigmas[1]]
    res.loc[counter + 1] = [experiment, 'ignore', deep, mse_ig, np.nan, np.nan]
    res.loc[counter + 2] = [experiment, 'embed', deep, mse_em, np.nan, np.nan]
    counter += 3

# 5-fold cross-validation over the stacked data set (KFold defaults: no shuffling).
kf = KFold(n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y[train_index], y[test_index]
    iterate_reg_types(X_train, X_test, y_train, y_test, True, experiment=i)

iteration 0
 finished lmm deep=True, mse: 0.14
 finished ignore deep=True, mse: 0.15
 finished embed deep=True, mse: 0.15
iteration 1
 finished lmm deep=True, mse: 0.14
 finished ignore deep=True, mse: 0.16
 finished embed deep=True, mse: 0.16
iteration 2
 finished lmm deep=True, mse: 0.14
 finished ignore deep=True, mse: 0.15
 finished embed deep=True, mse: 0.16
iteration 3
 finished lmm deep=True, mse: 0.14
 finished ignore deep=True, mse: 0.15
 finished embed deep=True, mse: 0.15
iteration 4
 finished lmm deep=True, mse: 0.15
 finished ignore deep=True, mse: 0.16
 finished embed deep=True, mse: 0.16


In [33]:
# Display the per-fold results table (one row per fold and model variant).
res

Unnamed: 0,experiment,exp_type,deep,mse,sigma_e_est,sigma_b_est
0,0,lmm,True,0.139199,0.066692,0.077929
1,0,ignore,True,0.150673,,
2,0,embed,True,0.150674,,
3,1,lmm,True,0.144429,0.061695,0.072743
4,1,ignore,True,0.16168,,
5,1,embed,True,0.163593,,
6,2,lmm,True,0.140581,0.062678,0.070015
7,2,ignore,True,0.152785,,
8,2,embed,True,0.158924,,
9,3,lmm,True,0.140788,0.062919,0.070368


In [34]:
# Save the results table. The target folder is created first so that a
# finished cross-validation run is not lost to a missing '../results' directory.
results_file = '../results/res_airbnb.csv'
os.makedirs(os.path.dirname(results_file), exist_ok=True)
res.to_csv(results_file)