In [None]:
import os
import sys
# Make the repository root (two levels up from this notebook) importable.
module_path = os.path.abspath('../..')
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

In [None]:
import time
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from lmmnn.nn import reg_nn_ohe_or_ignore, reg_nn_embed, reg_nn_lmm, reg_nn_svdkl, reg_nn_cnn
from lmmnn.simulation import Count

import tensorflow.keras.backend as K

In [None]:
# data_cleaned_train_comments_X.csv and the rest are the result of an ETL process described in
# Kalehbasti et al. (2019). We followed the script in their GitHub repo exactly.
path = '../../AirBnbPricePrediction/Data'


def _read_split(split):
    """Read one split ('train', 'val' or 'test') and return (features frame, 1-D target array).

    Paths are built with os.path.join so that `path` works with or without a trailing separator.
    """
    features = pd.read_csv(os.path.join(path, 'data_cleaned_%s_comments_X.csv' % split))
    target = pd.read_csv(os.path.join(path, 'data_cleaned_%s_y.csv' % split)).values
    return features, target.reshape(len(target), )


X_train, y_train = _read_split('train')
X_val, y_val = _read_split('val')
X_test, y_test = _read_split('test')

# Truthy entries of `coeffs` mark (by position) the columns of X_train to keep.
coeffs = np.load(os.path.join(path, 'selected_coefs.npy'))
col_set = {X_train.columns[i] for i, keep in enumerate(coeffs) if keep}

# Columns that must always be present, whether or not they were selected.
extra_cols = ['longitude', 'latitude', 'host_id']

# Build an ordered list (file order first, then any missing extras) instead of list(set(...)):
# set iteration order for strings changes between kernel runs, which made the column order
# of X non-reproducible.
selected_cols = [col for col in X_train.columns if col in col_set]
selected_cols += [col for col in extra_cols if col not in col_set]

X_train = X_train[selected_cols]
X_val = X_val[selected_cols]
X_test = X_test[selected_cols]

X = pd.concat([X_train, X_val, X_test], ignore_index=True)
y = np.concatenate([y_train, y_val, y_test])

X['id'] = np.arange(X.shape[0]) # longitude was already in col_set and we've added id column to be able to re-sort the data after the join

print(X.shape)
X.head()

In [None]:
# Distribution of the target; listing price already logged.
fig, ax = plt.subplots()
ax.hist(y, bins=20)
ax.set(xlabel='log listing price', ylabel='count', title='Distribution of the target (log price)')
plt.show()

In [None]:
# Round the coordinates to two decimals so that nearby listings collapse onto the same point,
# then rescale both coordinates to the range [-10, 10].
geo_cols = ['latitude', 'longitude']
X[geo_cols] = X[geo_cols].round(2)
scaler = MinMaxScaler(feature_range=(-10, 10))
X[geo_cols] = scaler.fit_transform(X[geo_cols])

# One row per distinct (latitude, longitude) pair, labelled with a running integer id.
location_df = X.groupby(geo_cols).size().index.to_frame()
location_df['location'] = np.arange(len(location_df))
location_df.head()

In [None]:
# Attach the integer location id to every row by joining on the coordinate pair, then use the
# helper 'id' column to restore the original row order before dropping it.
X = (
    X.set_index(['latitude', 'longitude'])
    .join(location_df[['location']])
    .reset_index()
    .sort_values(by=['id'])
    .drop(columns=['id'])
)
X.index = np.arange(len(X))

In [None]:
# Sanity check: number of distinct location ids and the largest id.
location_ids = X['location']
print(len(location_ids.unique()))
print(location_ids.max())

In [None]:
# Squared Euclidean distance between every pair of distinct (scaled) locations.
coords = location_df[['latitude', 'longitude']].values
dist_matrix = np.square(squareform(pdist(coords)))
print(dist_matrix.shape)

In [None]:
# Rename the coordinates to D1/D2 and the two grouping variables to z0 (location) / z1 (host).
new_names = {'latitude': 'D1', 'longitude': 'D2', 'location': 'z0', 'host_id': 'z1'}
X.rename(columns=new_names, inplace=True)

# No other column names may start with z or D if we are using our functions:
# lower-case everything except D1/D2, then list the matches for a manual check.
X.columns = [col if col in ('D1', 'D2') else col.lower() for col in X.columns]
for prefix in ('D', 'z'):
    print([col for col in X.columns if col.startswith(prefix)])

In [None]:
# Settings read as globals by reg_nn() and forwarded to the lmmnn.nn functions.
# NOTE(review): the meaning of each option is defined by lmmnn, not by this notebook;
# the groupings below are by name only and should be confirmed against that package.

# Model variant.
mode = 'spatial_and_categoricals'

# Cardinalities: distinct z1 values (host_id) and distinct z0 values (location).
qs = [len(X['z1'].unique())]
q_spatial = len(X['z0'].unique())

# Training-loop settings.
batch = 100
epochs = 500
patience = 10
shuffle = False

# Network architecture.
n_neurons = [10, 3]
dropout = []
activation = 'relu'
spatial_embed_neurons = None
Z_non_linear = False
Z_embed_dim_pct = 10
resolution = 100

# Variance-component options.
n_sig2bs = 1
n_sig2bs_spatial = 2
est_cors = []

# Miscellaneous options.
time2measure_dict = None
verbose = True
log_params = False
idx = None

In [None]:
def reg_nn(X_train, X_test, y_train, y_test, reg_type):
    """Fit one model variant on a train/test split, plot its predictions and score it.

    All hyper-parameters (``qs``, ``q_spatial``, ``x_cols``, ``batch``, ``epochs``, ``patience``,
    ``n_neurons``, ``dropout``, ``activation``, ``mode``, ``dist_matrix``, ...) are read from
    notebook-level globals and forwarded to the matching ``lmmnn.nn`` function.

    Parameters
    ----------
    X_train, X_test : feature data for the two halves of the split.
    y_train, y_test : matching targets.
    reg_type : str
        One of 'ohe', 'lmm', 'ignore', 'embed', 'svdkl' or 'cnn'.

    Returns
    -------
    tuple
        ``(mse, sigmas, n_epochs, elapsed)``: test mean squared error, the second and fifth
        values returned by the lmmnn function, and wall-clock seconds spent in that function.

    Raises
    ------
    ValueError
        If ``reg_type`` is not one of the supported names.

    Side effects: clears the Keras session, runs the garbage collector and shows a scatter
    plot of ``y_test`` against the predictions.
    """
    start = time.time()
    if reg_type == 'ohe':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'lmm':
        y_pred, sigmas, _, _, n_epochs = reg_nn_lmm(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode,
            n_sig2bs, n_sig2bs_spatial, est_cors, dist_matrix, spatial_embed_neurons,
            verbose, Z_non_linear, Z_embed_dim_pct, log_params, idx, shuffle)
    elif reg_type == 'ignore':
        y_pred, sigmas, _, _, n_epochs = reg_nn_ohe_or_ignore(
            X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose, ignore_RE=True)
    elif reg_type == 'embed':
        y_pred, sigmas, _, _, n_epochs = reg_nn_embed(
            X_train, X_test, y_train, y_test, qs, q_spatial, x_cols, batch, epochs, patience,
            n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'svdkl':
        y_pred, sigmas, _, _, n_epochs = reg_nn_svdkl(X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs,
            patience, n_neurons, dropout, activation, mode, n_sig2bs, n_sig2bs_spatial, est_cors, verbose)
    elif reg_type == 'cnn':
        y_pred, sigmas, _, _, n_epochs = reg_nn_cnn(X_train, X_test, y_train, y_test, qs, x_cols, batch, epochs,
            patience, n_neurons, dropout, activation, mode,
            n_sig2bs, n_sig2bs_spatial, est_cors, resolution, verbose)
    else:
        # The leading space keeps the offending name readable in the message.
        raise ValueError(reg_type + ' is an unknown reg_type')
    end = time.time()
    # Release the Keras graph and Python garbage before the next model is trained.
    K.clear_session()
    gc.collect()
    mse = np.mean((y_pred - y_test)**2)
    plt.scatter(y_test, y_pred, alpha = 0.5)
    plt.show()
    return mse, sigmas, n_epochs, end - start

In [None]:
# One row per (fold, model type); filled in by iterate_reg_types().
res = pd.DataFrame(columns=['experiment', 'exp_type', 'mse', 'sigma_e_est', 'sigma_b0_est', 'sigma_b1_est', 'n_epoch', 'time'])


def iterate_reg_types(X_train, X_test, y_train, y_test, experiment=None):
    """Run the 'lmm', 'ohe', 'ignore' and 'embed' models on one split and append their rows to `res`.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : the split, passed unchanged to reg_nn().
    experiment : optional
        Label stored in the 'experiment' column (the fold number in this notebook). When
        omitted, the global loop variable `i` is used, which is what this function always
        did before the parameter existed.

    Relies on the globals `res` (results frame) and `counter` (generator whose next() value is
    used as the row label of `res` -- presumably consecutive integers; verify against
    lmmnn.simulation.Count).
    """
    if experiment is None:
        # Backward compatibility for callers that relied on the implicit global.
        experiment = i
    mse_lmm, sigmas, n_epochs_lmm, time_lmm = reg_nn(X_train, X_test, y_train, y_test, reg_type='lmm')
    print(' finished lmmnn, mse: %.4f' % (mse_lmm))
    mse_ohe, _, n_epochs_ohe, time_ohe = reg_nn(X_train, X_test, y_train, y_test, reg_type='ohe')
    print(' finished ohe, mse: %.4f' % (mse_ohe))
    mse_ig, _, n_epochs_ig, time_ig = reg_nn(X_train, X_test, y_train, y_test, reg_type='ignore')
    print(' finished ignore, mse: %.4f' % (mse_ig))
    mse_em, _, n_epochs_em, time_em = reg_nn(X_train, X_test, y_train, y_test, reg_type='embed')
    print(' finished embed, mse: %.4f' % (mse_em))
    # Only the lmm model reports variance estimates; the other rows get NaN in those columns.
    res.loc[next(counter)] = [experiment, 'lmm', mse_lmm, sigmas[0], sigmas[2][0], sigmas[2][1], n_epochs_lmm, time_lmm]
    res.loc[next(counter)] = [experiment, 'ohe', mse_ohe, np.nan, np.nan, np.nan, n_epochs_ohe, time_ohe]
    res.loc[next(counter)] = [experiment, 'ignore', mse_ig, np.nan, np.nan, np.nan, n_epochs_ig, time_ig]
    res.loc[next(counter)] = [experiment, 'embed', mse_em, np.nan, np.nan, np.nan, n_epochs_em, time_em]


kf = KFold(n_splits=5, shuffle=True, random_state=42)
counter = Count().gen()

# Every column except the location id z0 is handed to the models as a feature column.
x_cols = [col for col in X.columns if col not in ['z0']]
# NOTE(review): x_cols_to_scale is not used anywhere in the visible cells.
x_cols_to_scale = [col for col in x_cols if col not in ['D1', 'D2']]

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    X_train, X_test, y_train, y_test = X.loc[train_index].copy(), X.loc[test_index].copy(), y[train_index], y[test_index]
    # Wrap the targets in Series that share the index of their feature frames.
    y_train = pd.Series(y_train, index=X_train.index)
    y_test = pd.Series(y_test, index=X_test.index)
    iterate_reg_types(X_train, X_test, y_train, y_test, experiment=i)

In [None]:
# Display the results table accumulated over all folds and model types.
res

In [None]:
# Create the output folder first so that a long run is not lost to a missing directory.
os.makedirs('../../results', exist_ok=True)
res.to_csv('../../results/res_airbnb_spatial_and_categorical.csv')