In [3]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [4]:
# Spatial implementation suffers in time on a GPU due to a TF bug, therefore consider disabling it
# Also, due to a TF bug there seems to be a memory leakage between LMMNN and other methods when run together,
# and those other methods suffer in performance. Consider running LMMNNN separately.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [5]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

from sklearn.model_selection import KFold

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_lmm, reg_nn_embed
from lmmnn.simulation import Count

In [6]:
# Note: data_cleaned_train_comments_X.csv is the result of an ETL process described in Kalehbasti et. al. (2019), see our paper.
# We followed the script in their Github repo exactly.
path = 'AirBnbPricePrediction/Data/'
X_train = pd.read_csv(path + 'data_cleaned_train_comments_X.csv')
y_train = pd.read_csv(path + 'data_cleaned_train_y.csv').values
y_train = y_train.reshape(len(y_train), )

X_val = pd.read_csv(path + 'data_cleaned_val_comments_X.csv')
y_val = pd.read_csv(path + 'data_cleaned_val_y.csv').values
y_val = y_val.reshape(len(y_val), )

X_test = pd.read_csv(path + 'data_cleaned_test_comments_X.csv')
y_test = pd.read_csv(path + 'data_cleaned_test_y.csv').values
y_test = y_test.reshape(len(y_test), )

In [7]:
coeffs = np.load(path + 'selected_coefs.npy')
col_set = set()

for i in range(len(coeffs)):
    if coeffs[i]:
        col_set.add(X_train.columns[i])
X_train = X_train[list(col_set | set(['longitude', 'latitude']))]
X_val = X_val[list(col_set | set(['longitude', 'latitude']))]
X_test = X_test[list(col_set | set(['longitude', 'latitude']))]

X = pd.concat([X_train, X_val, X_test], ignore_index=True)
y = np.concatenate([y_train, y_val, y_test])

In [8]:
X['id'] = np.arange(X.shape[0])

In [9]:
len(col_set)

196

In [10]:
print(X.shape) # longitude was already in col_set and we've added id column to be able to re-sort the data after the join
print(y.shape)

(49976, 198)
(49976,)


In [11]:
X[['latitude', 'longitude']].head()

Unnamed: 0,latitude,longitude
0,0.415853,0.61404
1,0.466541,0.510596
2,0.427307,0.687932
3,0.553778,0.479174
4,0.581165,0.479014


In [12]:
X[['latitude', 'longitude']] = X[['latitude', 'longitude']].round(2)
X[['latitude', 'longitude']].head()

Unnamed: 0,latitude,longitude
0,0.42,0.61
1,0.47,0.51
2,0.43,0.69
3,0.55,0.48
4,0.58,0.48


In [13]:
location_df = X.groupby(['latitude', 'longitude']).size().sort_values(ascending=False).index.to_frame()
location_df['location'] = np.arange(location_df.shape[0])
location_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,location
latitude,longitude,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.64,0.48,0.64,0.48,0
0.64,0.47,0.64,0.47,1
0.55,0.48,0.55,0.48,2
0.63,0.47,0.63,0.47,3
0.55,0.49,0.55,0.49,4


In [14]:
X = X.set_index(['latitude', 'longitude']).join(location_df[['location']]).reset_index().sort_values(by=['id']).drop(['id'], axis=1)
X.index = np.arange(X.shape[0])
X.shape

(49976, 198)

In [15]:
X.head()

Unnamed: 0,latitude,longitude,Dryer,Room-darkening_shades,Hotel,Coffee_maker,Loft,Hell's Kitchen,Gym,Bensonhurst,...,Inwood,google_verification,Boutique hotel,accommodates,a few days or more,First_aid_kit,Bed_linens,SoHo,Game_console,location
0,0.42,0.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,556
1,0.47,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,1.0,0.0,0.0,0.0,177
2,0.43,0.69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.266667,0.0,0.0,1.0,0.0,0.0,939
3,0.55,0.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,2
4,0.58,0.48,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,168


In [16]:
# relevant Adam params taken from original paper
NUM_ITERATIONS = 1000
BATCH_SIZE = 256
LEARNING_RATE = 0.001
DECAY_RATE = 0.0001

In [17]:
n_cats = len(X['location'].unique())
print(n_cats)
print(X['location'].max())

2880
2879


In [18]:
mode = 'spatial'
n_sig2bs = 2
est_cors = []
n_neurons = [20, 5]
dropout = None
activation = 'relu'

In [19]:
coords = location_df[['latitude', 'longitude']].values
dist_matrix = squareform(pdist(coords)) ** 2
dist_matrix.shape

(2880, 2880)

In [20]:
dist_matrix[:5,:5]

array([[0.    , 0.0001, 0.0081, 0.0002, 0.0082],
       [0.0001, 0.    , 0.0082, 0.0001, 0.0085],
       [0.0081, 0.0082, 0.    , 0.0065, 0.0001],
       [0.0002, 0.0001, 0.0065, 0.    , 0.0068],
       [0.0082, 0.0085, 0.0001, 0.0068, 0.    ]])

In [21]:
X.rename(columns={'latitude': 'D1', 'longitude': 'D2', 'location': 'z0'}, inplace=True)

In [22]:
def reg_nn(X_train, X_test, y_train, y_test, n_cats, batch=30, epochs=100, patience=10, reg_type='ohe', verbose=False):
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, _, n_epochs = reg_nn_ohe_or_ignore(X_train, X_test, y_train, y_test, [n_cats], x_cols,
        batch, epochs, patience, n_neurons, dropout, activation, mode, n_sig2bs, est_cors, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, _, n_epochs = reg_nn_lmm(X_train, X_test, y_train, y_test, [n_cats], x_cols,
        batch, epochs, patience, n_neurons, dropout, activation, mode, n_sig2bs, est_cors, dist_matrix, verbose)
    elif reg_type == 'ignore':
        y_pred, sigmas, _, n_epochs = reg_nn_ohe_or_ignore(X_train, X_test, y_train, y_test, [n_cats], x_cols,
        batch, epochs, patience, n_neurons, dropout, activation, mode, n_sig2bs, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, n_epochs = reg_nn_embed(X_train, X_test, y_train, y_test, [n_cats], x_cols,
        batch, epochs, patience, n_neurons, dropout, activation, mode, n_sig2bs, est_cors, verbose)
    # elif reg_type == 'menet':
    #     y_pred, sigmas, _, n_epochs = reg_nn_menet(X_train, X_test, y_train, y_test, n_cats, x_cols, batch,
    #     epochs, patience, n_neurons, dropout, activation)
    else:
      raise ValueError(reg_type + 'is an unknown reg_type')
    end = time.time()
    mse = np.mean((y_pred - y_test)**2)
    return mse, sigmas, n_epochs, end - start

In [25]:
res = pd.DataFrame(columns=['experiment', 'exp_type', 'mse', 'sigma_e_est', 'sigma_b0_est', 'sigma_b1_est', 'n_epoch', 'time'])

def iterate_reg_types(X_train, X_test, y_train, y_test, counter, verbose):
    mse_lmm, sigmas, n_epochs_lmm, time_lmm = reg_nn(X_train, X_test, y_train, y_test, n_cats, reg_type='lmm', verbose=verbose)
    print(' finished lmm, mse: %.2f' % mse_lmm)
    gc.collect()
    mse_ohe, _, n_epochs_ohe, time_ohe = reg_nn(X_train, X_test, y_train, y_test, n_cats, reg_type='ohe', verbose=verbose)
    print(' finished ohe, mse: %.2f' % mse_ohe)
    gc.collect()
    mse_ig, _, n_epochs_ig, time_ig = reg_nn(X_train, X_test, y_train, y_test, n_cats, reg_type='ignore', verbose=verbose)
    print(' finished ignore, mse: %.2f' % mse_ig)
    gc.collect()
    mse_em, _, n_epochs_em, time_em = reg_nn(X_train, X_test, y_train, y_test, n_cats, reg_type='embed', verbose=verbose)
    print(' finished embed, mse: %.2f' % mse_em)
    gc.collect()
    # mse_me, sigmas_me, n_epochs_me, time_me = reg_nn(X_train, X_test, y_train, y_test, n_cats, reg_type='menet', verbose=verbose)
    # print(' finished menet, mse: %.2f' % mse_me)
    res.loc[next(counter)] = [i, 'lmm', mse_lmm, sigmas[0], sigmas[1][0], sigmas[1][1], n_epochs_lmm, time_lmm]
    res.loc[next(counter)] = [i, 'ohe', mse_ohe, np.nan, np.nan, np.nan, n_epochs_ohe, time_ohe]
    res.loc[next(counter)] = [i, 'ignore', mse_ig, np.nan, np.nan, np.nan, n_epochs_ig, time_ig]
    res.loc[next(counter)] = [i, 'embed', mse_em, np.nan, np.nan, np.nan, n_epochs_em, time_em]
    # res.loc[next(counter)] = [i, 'menet', mse_me, sigmas_me[0], np.nan, np.nan, n_epochs_me, time_me]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
counter = Count().gen()
x_cols = [col for col in X.columns if col != 'z0']

In [29]:
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y[train_index], y[test_index]
    # scaler = StandardScaler()
    # y_train = scaler.fit_transform(y_train.values.reshape(-1, 1)).reshape(X_train.shape[0])
    # y_test = scaler.transform(y_test.values.reshape(-1, 1)).reshape(X_test.shape[0])
    y_train = pd.Series(y_train, index=X_train.index)
    y_test = pd.Series(y_test, index=X_test.index)
    iterate_reg_types(X_train, X_test, y_train, y_test, counter, verbose=True)

In [28]:
res

In [None]:
res.to_csv('../results/res_airbnb_spatial.csv')