In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from experiments.helpers import plot_all, print_metrics, plot_loss

## Data Loading and Feature Engineering

In [None]:
# Read the train and test CSVs with identical options; "Time" is parsed as datetime.
df_train, df_test = (
    pd.read_csv(csv_path, parse_dates=["Time"])
    for csv_path in (
        './dataset/house_price_train.csv',
        './dataset/house_price_test.csv',
    )
)

In [None]:
# Categorical feature columns (each one gets its own embedding in the model).
# A reduced alternative that was tried: ["City", "Floor"]
cat_columns = [
    "City",
    "District",
    "Street",
    "Community",
    "Floor",
]

# Numerical feature columns (normalized inside the model).
num_columns = [
    "#Floors",
    "#Rooms",
    "#Halls",
    "Area",
]

In [None]:
# Drop the columns that are used neither as model features nor as the target.
unused_columns = ["Id", "Time", "Orient", "Lat", "Lon"]

df_train_pool = df_train.drop(columns=unused_columns)
# The test pool must be derived from df_test (not df_train); otherwise every
# downstream "test" array and the final evaluation would use training rows.
df_test_pool = df_test.drop(columns=unused_columns)

In [None]:
# Standardize the target. The scaler is fitted on training prices only and the
# same statistics are then applied to both splits.
price_train_2d = df_train_pool[["Price"]].values  # shape (n_rows, 1)
price_test_2d = df_test_pool[["Price"]].values

scaler = StandardScaler().fit(price_train_2d)

y_train = scaler.transform(price_train_2d)
y_test = scaler.transform(price_test_2d)

In [None]:
# Extract the numeric feature columns of each split as plain NumPy matrices.
X_train_num, X_test_num = (
    pool[num_columns].to_numpy() for pool in (df_train_pool, df_test_pool)
)

### Deep-learning-based model using TensorFlow

In [None]:
# Baseline model: one embedding per categorical column plus normalized numeric
# features, concatenated, reshaped to a length-1 sequence and passed through
# two stacked LSTM layers into a single regression output.
tf.random.set_seed(42)

EMBEDDING_DIM = 200     # width of every categorical embedding
MAX_VOCAB_SIZE = 1000   # max embedding rows per column, including the OOV row
num_features = len(num_columns)

model_inputs = []
models_intermediate = []  # not used in this cell; still defined so the cell exposes the same names

# create categorical embedding model
categorical_models = []
for col in cat_columns:
    # Keep only the most frequent training categories so the number of lookup
    # indices can never exceed the number of embedding rows. Index 0 is the
    # out-of-vocabulary bucket: it receives the categories cut by the cap as
    # well as any value that was never seen during training.
    vocab = df_train_pool[col].value_counts().index[: MAX_VOCAB_SIZE - 1].tolist()

    cat_input = tf.keras.layers.Input(shape=(1,), name=f"cat_{col}", dtype=tf.string)
    lookup_layer = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None, num_oov_indices=1)

    # Size the embedding from the lookup itself (vocabulary + OOV row) so the
    # two layers always agree.
    vocab_size = lookup_layer.vocabulary_size()
    print(f"{col} vocab size: {vocab_size}")

    cat_lookup = lookup_layer(cat_input)
    cat_embedding = tf.keras.layers.Embedding(vocab_size, EMBEDDING_DIM)(cat_lookup)
    cat_flatten = tf.keras.layers.Flatten()(cat_embedding)

    model_inputs.append(cat_input)
    categorical_models.append(cat_flatten)

# create numerical input; normalization statistics come from the training split only
num_normalizer = tf.keras.layers.Normalization()
num_normalizer.adapt(X_train_num)

numerical_input = tf.keras.layers.Input(shape=(num_features,), name="num_input")
numerical_normalize_layer = num_normalizer(numerical_input)

model_inputs.append(numerical_input)

# merge all inputs
cat_num_concat = tf.keras.layers.concatenate([*categorical_models, numerical_normalize_layer])

# reshape to 3D (batch, timesteps=1, features) for the recurrent layers
merged_width = num_features + len(categorical_models) * EMBEDDING_DIM
intermediate_reshape_layer = tf.keras.layers.Reshape(target_shape=(1, merged_width))(cat_num_concat)

# create LSTM layers; the first returns the full sequence so the second can consume it
lstm1_layer = tf.keras.layers.LSTM(64, return_sequences=True)(intermediate_reshape_layer)
lstm2_layer = tf.keras.layers.LSTM(64)(lstm1_layer)

# create output layer (single standardized-price prediction)
output_layer = tf.keras.layers.Dense(1)(lstm2_layer)

# create final model
baseline_model = tf.keras.Model(inputs=model_inputs, outputs=output_layer)

In [None]:
# Training configuration: Adam with default settings, mean squared error as
# the loss, and mean absolute error reported as an additional metric.
baseline_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()],
)

In [None]:
# Draw the layer graph of the baseline model, annotated with tensor shapes.
# NOTE(review): plot_model relies on pydot and Graphviz being installed — confirm for this environment.
tf.keras.utils.plot_model(baseline_model, show_shapes=True)

In [None]:
# Print the layer-by-layer summary of the baseline model.
baseline_model.summary()

In [None]:
# TensorBoard logging for this run; histogram_freq=1 records histograms every epoch.
log_dir = "logs/fit/baseline_lstm_akhir"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Inputs are passed as a dict keyed by the names of the model's Input layers:
# "num_input" for the numeric matrix and "cat_<column>" for each categorical
# column, reshaped to a single-feature column vector.
train_inputs = {
    "num_input": X_train_num,
    **{
        f"cat_{col}": df_train_pool[col].values.reshape(-1, 1)
        for col in cat_columns
    },
}

# Train for 20 epochs, holding out 15% of the training data for validation.
history = baseline_model.fit(
    train_inputs,
    y_train,
    validation_split=0.15,
    epochs=20,
    callbacks=[tensorboard_callback],
)

## Evaluation

In [None]:
# Assemble the test inputs with the same keys that were used for training.
test_inputs = {
    "num_input": X_test_num,
    **{
        f"cat_{col}": df_test_pool[col].values.reshape(-1, 1)
        for col in cat_columns
    },
}

predicted = baseline_model.predict(test_inputs)

# Undo the price standardization so the metrics are in original price units.
predicted_unscaled = scaler.inverse_transform(predicted.reshape(-1, 1))
y_true_unscaled = scaler.inverse_transform(y_test.reshape(-1, 1))

print_metrics(y_true_unscaled, predicted_unscaled)

In [None]:
# Visualize the History object returned by fit().
# NOTE(review): plot_loss comes from experiments.helpers; presumably it draws the
# training/validation loss curves per epoch — confirm against the helper.
plot_loss(history)
plt.show()

In [None]:
# Compare true and predicted prices, both in original (unscaled) units.
# NOTE(review): plot_all comes from experiments.helpers; the exact figures it
# produces are not visible here — confirm against the helper.
plot_all(y_true_unscaled, predicted_unscaled)
plt.show()