In [1]:
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from preprocessing import PreprocessorBase
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error, r2_score

2023-12-27 08:18:38.405300: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Basic Preprocessing

In [2]:
# Instantiate the project preprocessing helper; the cleaning cells below call
# its methods (derive_kecamatan, transform_dict_col, transform_list_col, ...).
cleaner = PreprocessorBase()

[nltk_data] Downloading package punkt to /home/fahmi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Load the house listings (JSON Lines: one record per line) and preview two rows.
df = pd.read_json(path_or_buf="../dataset/houses-9k.json", lines=True)
df.head(n=2)

Unnamed: 0,id,price,installment,address,tags,description,specs,facilities,agent,images,url
0,hos13819879,295000.0,21,"Sentul City, Bogor","[Cash Keras, One Gate System]","Rumah siap masuk dengan style englang house,be...","{'Kamar Tidur': '3', 'Kamar Mandi': '4', 'Luas...","[Jalur Telepon, Taman, Tempat Jemuran, Akses P...","{'name': 'Heri Andrian', 'url': 'https://www.r...",[https://picture.rumah123.com/r123-images/720x...,https://www.rumah123.com/properti/bogor/hos138...
1,hos13868613,11000.0,7,"Bogor Barat, Bogor","[Bisa Nego, Cash Keras/KPR]","Dijual rumah seken terawat, bebas banjir, loka...","{'Kamar Tidur': '2', 'Kamar Mandi': '1', 'Luas...","[Keamanan, Taman, Tempat Jemuran, Keamanan 24 ...","{'name': 'PASHOUSES ID', 'url': 'https://www.r...",[https://picture.rumah123.com/r123-images/720x...,https://www.rumah123.com/properti/bogor/hos138...


In [27]:
# Derive the district column first: it is computed from the raw frame, before
# the source columns are dropped at the end of this cell.
df["kecamatan"] = cleaner.derive_kecamatan(df)

# Expand the nested columns into flat `spec_*`, `tag_*` and `fac_*` features,
# then discard the raw/identifier columns that are no longer needed.
df = (
    df
    .pipe(cleaner.transform_dict_col, "specs", "spec_")
    .pipe(cleaner.transform_list_col, "tags", return_embeddings=False, prefix="tag_")
    .pipe(
        cleaner.transform_list_col,
        "facilities",
        return_embeddings=False,
        prefix="fac_",
        split_fun=cleaner.dedupe_facilities,
    )
    .pipe(cleaner.infer_spec_cols, prefix="spec_")
    .drop(
        columns=[
            'id',
            "tags",
            "facilities",
            'images',
            'installment',
            'address',
            'description',
            'specs',
            'agent',
            'url',
            'spec_id_iklan',
        ]
    )
)

In [28]:
# Fraction of missing values per column, most incomplete columns first.
nan_ratio = df.isna().sum() / len(df)
nan_ratio.sort_values(ascending=False)

  (df.isna().sum() / len(df)).sort_values(ascending=False)


spec_tahun_di_renovasi       0.823256
spec_kamar_mandi_pembantu    0.760473
spec_kamar_pembantu          0.724329
spec_garasi                  0.684094
spec_hadap                   0.612910
                               ...   
fac_CCTV                     0.000000
fac_DAPUR                    0.000000
fac_GAS                      0.000000
fac_GROUND_FLOOR             0.000000
fac_WORKSHOP                 0.000000
Length: 79, dtype: Sparse[float64, 0.0]

In [29]:
# Drop sparse-information columns, then re-check the missing-value ratios.
# The warning echoed in this cell's output shows the helper calling
# `df.dropna(axis=1, thresh=threshold * len(df))`. `thresh` is a minimum count
# of non-missing values, so 0.3 keeps every column that is at least 30% filled
# (the listing below still contains columns with ~68% missing values).
# NOTE(review): confirm this is the intended cut-off.
df = cleaner.drop_features_by_nan(df, threshold=0.3)

nan_ratio = df.isna().sum() / len(df)
nan_ratio.sort_values(ascending=False)

  return df.dropna(axis=1, thresh=threshold * len(df))


spec_garasi                   0.684094
spec_hadap                    0.612910
spec_tahun_dibangun           0.585830
spec_material_bangunan        0.525810
spec_material_lantai          0.501382
spec_konsep_dan_gaya_rumah    0.489997
spec_pemandangan              0.447110
spec_lebar_jalan              0.397369
spec_dapur                    0.394938
spec_carport                  0.352714
spec_sumber_air               0.312369
spec_kondisi_perabotan        0.287056
spec_ruang_makan              0.199072
spec_hook                     0.138499
spec_terjangkau_internet      0.138388
spec_ruang_tamu               0.137725
spec_kondisi_properti         0.128551
spec_jumlah_lantai            0.116724
spec_daya_listrik             0.094506
spec_kamar_tidur              0.025644
spec_kamar_mandi              0.023875
spec_sertifikat               0.002763
spec_luas_bangunan            0.001658
spec_luas_tanah               0.000111
price                         0.000111
kecamatan                

In [18]:
# Rows without a price have no label to learn from. Drop them instead of
# letting the numeric imputation below invent a mean price for the target.
df = df.dropna(subset=["price"])

# Categorical features: missing values become an explicit "[UNK]" category.
FILLNA_CAT_COLS = df.select_dtypes(include=['object']).columns.tolist()
df = df.fillna({k: "[UNK]" for k in FILLNA_CAT_COLS})

# Numeric features: mean imputation. The target column is excluded on purpose.
# NOTE(review): the means are computed on the full frame, i.e. before the
# train/test split, so test rows influence the imputed values.
FILLNA_NUM_COLS = [
    k for k in df.select_dtypes(exclude=['object']).columns.tolist() if k != "price"
]
df = df.fillna({k: df[k].mean() for k in FILLNA_NUM_COLS})

# The model is trained on log(price). The log is undefined for non-positive
# values, so fail loudly rather than silently producing -inf/NaN targets.
# Re-running this cell on its own would apply the log twice; use
# "Restart & Run All" instead.
assert (df["price"] > 0).all(), "price must be strictly positive before taking the log"
df["price"] = np.log(df["price"])

df.head(2)

  df.isna().sum() / len(df)


price               0.0
kecamatan           0.0
spec_kamar_tidur    0.0
spec_kamar_mandi    0.0
spec_luas_tanah     0.0
                   ... 
fac_TRACK_LARI      0.0
fac_WASTAFEL        0.0
fac_WATER_HEATER    0.0
fac_WATER_TANK      0.0
fac_WORKSHOP        0.0
Length: 79, dtype: Sparse[float64, 0.0]

## Dataset Building

In [None]:
# Hand-picked base columns: the target plus the district and spec features.
# Deliberately left out for now: spec_tipe_properti, spec_sertifikat,
# spec_kondisi_perabotan, spec_hook, spec_kondisi_properti.
BASE_COLS = [
    'price',
    'kecamatan',
    'spec_kamar_tidur',
    'spec_kamar_mandi',
    'spec_luas_tanah',
    'spec_luas_bangunan',
    'spec_daya_listrik',
    'spec_ruang_makan',
    'spec_ruang_tamu',
    'spec_jumlah_lantai',
    'spec_terjangkau_internet',
]

# The raw 'tags' and 'facilities' columns were dropped in the preprocessing
# cell after being expanded with the "tag_" / "fac_" prefixes, so selecting
# them by their old names raises a KeyError. Pick up the expanded columns.
MULTI_HOT_COLS = [c for c in df.columns if str(c).startswith(("tag_", "fac_"))]

# .copy() makes df_clean independent of df for any later column assignment.
df_clean = df[BASE_COLS + MULTI_HOT_COLS].copy()

# Seeded so the preview row is the same on every run.
df_clean.sample(1, random_state=42)

In [None]:
# Build features/target from df_clean, the frame the preprocessor's column
# lists are derived from. Using the wider `df` would let every unselected
# column through the ColumnTransformer's remainder="passthrough".
# The former `df["tags"]` / `df["facilities"]` array conversions are gone:
# both raw columns were dropped in the preprocessing cell, so reading them
# raised a KeyError.
X = df_clean.drop(columns=["price"])
y = df_clean["price"]

# Hold out 25% of the rows for testing; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=42)

In [None]:
# Preview the feature matrix (target column already removed).
X.head(2)

In [None]:
# Column lists are kept as ordered lists, not sets. Iteration order of a set
# of strings can change between interpreter runs, which would shuffle the
# order of the transformed features from one run to the next.

# multi-hot encoded features: the expanded tag_/fac_ indicator columns
# (the raw "tags" / "facilities" columns no longer exist at this point)
MULTI_HOT_PREFIXES = ("tag_", "fac_")
mul_cols = [c for c in df_clean.columns if str(c).startswith(MULTI_HOT_PREFIXES)]
# NOTE(review): if these indicator columns are pandas-sparse, PCA may reject
# them depending on the scikit-learn version - verify, or densify first.
mul_transformer = Pipeline(
    steps=[("pca", PCA(n_components=2))]
)

# encode numerical features (target and multi-hot indicators excluded)
num_cols = [
    c
    for c in df_clean.select_dtypes(include=['float64']).columns
    if c != "price" and c not in mul_cols
]
num_transformer = Pipeline(
   steps=[("scaler", StandardScaler())]
)

# encode categorical features (raw list columns, if ever present, are skipped)
cat_cols = [
    c
    for c in df_clean.select_dtypes(include=['object']).columns
    if c not in ("tags", "facilities") and c not in mul_cols
]
cat_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# combine all features; columns not listed above are passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
        ("mul", mul_transformer, mul_cols),
    ],
    remainder="passthrough",
)

In [None]:
# Learn scaling / encoding / PCA parameters from the training split only, then
# apply the fitted transformer to the test split. Fitting on X_test would base
# the preprocessing statistics on the data reserved for evaluation.
X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

X_test_prepared

## Modelling

In [None]:
# NOTE(review): `tf`, `cat_features`, `mul_features`, `num_features` and
# `all_inputs` are not defined in any visible cell of this notebook.

def _dense_stack(tensor, widths):
    """Chain one ReLU Dense layer per entry of `widths` onto `tensor`."""
    for width in widths:
        tensor = tf.keras.layers.Dense(width, activation="relu")(tensor)
    return tensor


# branch 1: categorical inputs -> 128 -> 64
categorical_layer = _dense_stack(tf.keras.layers.concatenate(cat_features), (128, 64))

# branch 2: multi-hot encoded inputs -> 128 -> 64
multi_hot_layer = _dense_stack(tf.keras.layers.concatenate(mul_features), (128, 64))

# trunk: both branches joined with the numeric inputs -> 256 -> 256 -> 128
all_features = tf.keras.layers.concatenate([categorical_layer, multi_hot_layer] + num_features)
x = _dense_stack(all_features, (256, 256, 128))
x = tf.keras.layers.Dropout(0.2)(x)

# single linear unit: the regression output
output = tf.keras.layers.Dense(1)(x)

# assemble the functional model and print its layer table
model = tf.keras.Model(all_inputs, output)
model.summary()

In [None]:
# Adam + MSE loss; MAE and R2 are tracked as additional metrics.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[
        tf.keras.metrics.MeanAbsoluteError(),
        tf.keras.metrics.R2Score(),
    ],
)

In [None]:
# Draw the model graph; rankdir="LR" lays it out left-to-right.
tf.keras.utils.plot_model(model, rankdir="LR")

In [None]:
# One TensorBoard run directory per launch, named by the start timestamp.
log_dir = f"logs/fit/{datetime.datetime.now():%Y%m%d-%H%M%S}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0)

# NOTE(review): `ds_train` / `ds_test` are not defined in any visible cell,
# and the test set doubles as the validation set here.
H = model.fit(
    ds_train,
    epochs=100,
    validation_data=ds_test,
    callbacks=[tensorboard_callback],
)

In [None]:
# Training vs. validation curves: MAE on the left panel, loss on the right.
history = H.history
epochs = range(1, len(history["loss"]) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

panels = [
    (ax1, "mean_absolute_error", "MAE"),
    (ax2, "loss", "loss"),
]
for ax, key, label in panels:
    ax.plot(epochs, history[key], label=f"Training {label}")
    ax.plot(epochs, history[f"val_{key}"], label=f"Validation {label}")
    ax.set_title(f"Training and validation {label}")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# NOTE(review): `df_test` / `df_train` are not defined in any visible cell.
# If their "price" column is the log-transformed price from the preprocessing
# cell, every metric below (MAPE in particular) is in log units - confirm.
y_pred = model.predict(ds_test)
y_test = df_test["price"].values.reshape(-1, 1)

# Plain NumPy array for the training target, so that the summary statistics
# below use the same estimator for every row. ndarray.std()/var() default to
# ddof=0, while pandas Series.std()/var() default to ddof=1.
y_train_arr = df_train["price"].to_numpy()

# Computed once and reused for both MSE and RMSE.
mse = mean_squared_error(y_test, y_pred)

print("R2", r2_score(y_test, y_pred))
print("MSE", mse)
print("RMSE", np.sqrt(mse))
print("MAE", mean_absolute_error(y_test, y_pred))
print("MAPE", mean_absolute_percentage_error(y_test, y_pred))

# Distribution summary: actual test targets vs. predictions vs. train targets.
pd.DataFrame({
  "Data": ["Actual", "Prediction", "Train"],
  "Means": [y_test.mean(), y_pred.mean(), y_train_arr.mean()],
  "Stddev": [y_test.std(), y_pred.std(), y_train_arr.std()],
  "Var": [y_test.var(), y_pred.var(), y_train_arr.var()]
})

In [None]:
# Side-by-side histograms of the actual, predicted and training targets.
y_train = df_train["price"].values.reshape(-1, 1)

fig, axes = plt.subplots(1, 3, figsize=(10, 4))
ax1, ax2, ax3 = axes

series_by_panel = zip(
    axes,
    (y_test, y_pred, y_train),
    ("Actual", "Predicted", "Train"),
)
for ax, values, title in series_by_panel:
    ax.hist(values, bins=50)
    ax.set_title(title)

plt.show()

In [None]:
# Actual vs. predicted targets.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.axis("tight")