In [1]:
import datetime
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [12]:
# Load the regression training table from the `etl` dataset folder and preview its first rows.
df = pd.read_parquet("../dataset/etl/L3.regression_train-sel_all.parquet")
df.head()

Unnamed: 0,price,kamar_tidur,kamar_mandi,carport,sertifikat,kamar_pembantu,kamar_mandi_pembantu,dapur,jumlah_lantai,hadap,...,facility_mezzanine,facility_musholla,facility_one_gate_system,facility_parkir,facility_playground,facility_shed,facility_taman,facility_wastafel,facility_water_heater,facility_water_tank
0,5300.0,4.0,4.0,2.0,SHM - Sertifikat Hak Milik,1.0,1.0,2.0,2.0,Selatan,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
2,1200.0,4.0,4.0,2.0,HGB - Hak Guna Bangunan,0.0,0.0,1.0,2.0,Utara,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,4500.0,3.0,3.0,2.0,SHM - Sertifikat Hak Milik,3.0,1.0,1.0,3.0,Timur,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,3190.0,4.0,4.0,2.0,SHM - Sertifikat Hak Milik,1.0,1.0,2.0,2.0,Timur,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
6,380.0,2.0,2.0,1.0,HGB - Hak Guna Bangunan,0.0,0.0,1.0,1.0,Selatan,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [13]:
# Print every column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18240 entries, 0 to 20145
Data columns (total 81 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   price                     18240 non-null  float64
 1   kamar_tidur               18240 non-null  float64
 2   kamar_mandi               18240 non-null  float64
 3   carport                   18240 non-null  float64
 4   sertifikat                18240 non-null  object 
 5   kamar_pembantu            18240 non-null  float64
 6   kamar_mandi_pembantu      18240 non-null  float64
 7   dapur                     18240 non-null  float64
 8   jumlah_lantai             18240 non-null  float64
 9   hadap                     18240 non-null  object 
 10  konsep_dan_gaya_rumah     18240 non-null  object 
 11  pemandangan               18240 non-null  object 
 12  tahun_dibangun            18240 non-null  float64
 13  tahun_di_renovasi         18240 non-null  float64
 14  sumber_air 

In [14]:
# Split the feature columns into three groups that are preprocessed differently:
#   * multihot_cols - indicator columns, selected by name prefix (plus a few named flags)
#   * cat_cols      - object-dtype columns
#   * num_cols      - everything else except the target `price`

# Prefixes are scanned one after another so columns stay grouped by prefix.
multihot_prefixes = ["floor_mat_", "house_mat_", "facility_", "tags_"]
multihot_cols = [
    col
    for prefix in multihot_prefixes
    for col in df.columns
    if col.startswith(prefix)
]

# Stand-alone flag columns are added only when present in this dataset variant.
extra_tags = ["ruang_tamu", "ruang_makan", "terjangkau_internet", "hook"]
multihot_cols.extend(tag for tag in extra_tags if tag in df.columns)

cat_cols = list(df.select_dtypes(include=["object"]).columns)

# Keep the original column order instead of going through `set` arithmetic:
# set iteration order for strings changes between interpreter runs, which would
# silently reorder the features of the transformed matrix from run to run.
non_numeric_cols = set(multihot_cols) | set(cat_cols) | {"price"}
num_cols = [col for col in df.columns if col not in non_numeric_cols]

In [15]:
# Target: natural log of `price`, kept as a plain NumPy array (index it positionally, e.g. y[idx]).
y = np.log(df["price"].to_numpy())
# Features: every column except the target.
X = df.drop(columns="price")

## Training Regression Models

In [16]:
# Accumulates one record (dict) per cross-validation fold for every model evaluated
# in the sections below. The training cells only append to this list, so re-run this
# cell before re-running them to avoid duplicated records.
cv_results = []

### Scikit-Learn

In [17]:
# Enable the Intel extension for scikit-learn.
# NOTE(review): this cell runs before the `sklearn` imports that follow; keep that
# order - the patch is presumably meant to be applied before estimators are imported.
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate, KFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

In [19]:
def cross_validate_ex(model, X, y, category, name, n_splits=10, random_state=21, n_jobs=4):
    """Cross-validate `model` and return one metrics record per fold.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) passed to `cross_validate`.
    X, y
        Features and target passed to `cross_validate`.
    category, name : str
        Labels copied verbatim into every returned record.
    n_splits : int, default 10
        Number of shuffled K-fold splits.
    random_state : int, default 21
        Seed of the K-fold shuffling.
    n_jobs : int, default 4
        Number of parallel jobs used by `cross_validate`.

    Returns
    -------
    list of dict
        One dict per fold with the keys `fit_time`, `score_time`, `r2`, `mse`,
        `mae`, `mape`, `category` and `name`.
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    scoring = ["r2", "neg_mean_squared_error", "neg_mean_absolute_error", "neg_mean_absolute_percentage_error"]
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs, verbose=1)

    # Derive the fold count from the result instead of hard-coding it, so the
    # records always match whatever `n_splits` was requested.
    n_folds = len(scores["fit_time"])

    # The error scorers are negated by scikit-learn ("neg_*"); flip the sign back
    # so the records hold plain, positive error values.
    return [
        {
            "fit_time": scores["fit_time"][fold],
            "score_time": scores["score_time"][fold],
            "r2": scores["test_r2"][fold],
            "mse": -scores["test_neg_mean_squared_error"][fold],
            "mae": -scores["test_neg_mean_absolute_error"][fold],
            "mape": -scores["test_neg_mean_absolute_percentage_error"][fold],
            "category": category,
            "name": name,
        }
        for fold in range(n_folds)
    ]

In [20]:
# Object-dtype columns: one-hot encode, ignoring categories not seen during fit.
categorical_encoder = Pipeline([("encoder", OneHotEncoder(handle_unknown="ignore"))])

# Numeric columns: rescale with MinMaxScaler (default settings).
numerical_encoder = Pipeline([("scaler", MinMaxScaler())])

# Route each column group to its preprocessing; the multi-hot indicator columns
# are forwarded untouched.
compose_transformers = ColumnTransformer(
    [
        ("passthrough", "passthrough", multihot_cols),
        ("catergorical_encoder", categorical_encoder, cat_cols),
        ("numerical_encoder", numerical_encoder, num_cols),
    ]
)

compose_transformers

In [11]:
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

In [None]:
# Baseline regressors as (category, name, estimator) triples.
# Every estimator with a stochastic component gets a fixed `random_state`, so a
# fresh "Restart & Run All" reproduces the same scores (the K-fold split is
# already seeded with the same value).
models = [
    ("Linear", "LinearRegression", LinearRegression()),
    ("Linear", "Lasso", Lasso()),
    ("Linear", "Ridge", Ridge()),
    ("Linear", "BayesianRidge", BayesianRidge()),
    ("Tree", "DecisionTreeRegressor", DecisionTreeRegressor(random_state=21)),
    ("KNN", "KNeighborsRegressor", KNeighborsRegressor()),
    ("SVM", "SVR", SVR()),
    ("SVM", "LinearSVR", LinearSVR(random_state=21)),
    ("Neural Network", "MLPRegressor", MLPRegressor(random_state=21)),
    ("Ensemble", "RandomForestRegressor", RandomForestRegressor(random_state=21)),
    ("Ensemble", "GradientBoostingRegressor", GradientBoostingRegressor(random_state=21)),
]

# Evaluate each model behind the shared preprocessing. Keeping the preprocessing
# inside the pipeline means it is fitted on the training part of each fold only.
for category, name, model in models:
    print(f"Evaluating {category}/{name} model")

    # preprocessing + regressor as one estimator
    regression_pipeline = Pipeline(
        steps=[
            ("preprocessor", compose_transformers),
            ("regressor", model),
        ]
    )

    # run cross validation and keep the per-fold records
    cv_results.extend(cross_validate_ex(regression_pipeline, X, y, category, name))

### CatBoost Models

In [None]:
from catboost import CatBoostRegressor, Pool

In [12]:
# Fit the preprocessing on the full feature frame and keep the transformed matrix
# for the manual K-fold loops that follow.
# NOTE(review): fitting on all rows before splitting lets the scaler/encoder see
# every fold's test rows, unlike the in-pipeline scikit-learn runs - confirm this
# is acceptable for the comparison.
# NOTE(review): ColumnTransformer can return a SciPy sparse matrix depending on
# `sparse_threshold` - confirm the result type suits the downstream consumers.
X_trans = compose_transformers.fit_transform(X)

In [None]:
# 10-fold cross-validation of CatBoost on the pre-transformed feature matrix.
# The split uses the same seed as the scikit-learn runs, so folds are comparable.
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X_trans, y)):
    print(f"Training fold {fold_i + 1}")

    # split data - `y` is a NumPy array, so it is indexed positionally
    # (it has no `.iloc` accessor)
    X_train, X_test = X_trans[train_idx], X_trans[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # create pools - `X_trans` is already one-hot encoded and carries no column
    # names, so the raw categorical column names must not be passed as
    # `cat_features`
    train_pool = Pool(data=X_train, label=y_train)
    test_pool = Pool(data=X_test, label=y_test)

    # train model
    model = CatBoostRegressor(verbose=0, random_seed=21, task_type="GPU")

    # The test fold is deliberately NOT passed as `eval_set`: with an eval set
    # CatBoost keeps the iteration that scores best on it, which would tune the
    # model on the very data it is evaluated on.
    fit_time_start = time.time()
    model.fit(train_pool, verbose=0)
    fit_time_end = time.time()

    # run predictions
    score_time_start = time.time()
    y_pred = model.predict(test_pool)
    score_time_end = time.time()

    # store metrics (same record layout as the scikit-learn results)
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time": score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "CatBoost",
        "name": "CatBoostRegressor without Embedding",
    })

### TensorFlow Models

In [15]:
import tensorflow as tf

2024-02-09 17:29:18.629373: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 17:29:18.629458: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 17:29:18.629488: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-09 17:29:18.651063: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# List the physical devices visible to TensorFlow.
tf.config.list_physical_devices()

2024-02-09 17:29:20.494602: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 17:29:20.511962: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 17:29:20.512063: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [17]:
def construct_tf_dataset(X, y, batch_size=128):
    """Wrap features and labels in a batched, cached, prefetching `tf.data.Dataset`.

    Parameters
    ----------
    X
        Feature rows; sliced along the first axis.
    y
        Labels; sliced along the first axis, one per feature row.
    batch_size : int, default 128
        Number of rows per batch.

    Returns
    -------
    tf.data.Dataset
        Dataset yielding `(features, labels)` batches.
    """
    # Slicing both arrays together (instead of zipping two separate datasets)
    # makes TensorFlow reject inputs whose first dimensions differ, where a zip
    # would silently stop at the shorter one.
    return (
        tf.data.Dataset.from_tensor_slices((X, y))
        .batch(batch_size)
        .cache()
        .prefetch(tf.data.AUTOTUNE)
    )

In [18]:
# NOTE(review): looks like a leftover smoke test of construct_tf_dataset - `xxxx`
# is not referenced by any other visible cell; consider removing this cell.
xxxx = construct_tf_dataset(X_trans, y)

2024-02-09 17:29:20.532511: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 17:29:20.532725: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 17:29:20.532790: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 17:29:20.698702: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-02-09 17:29:20.698938: I tensorflow/compile

64

In [None]:
def create_tf_model(input_dim=None, hidden_units=(512, 256, 128, 64)) -> tf.keras.Model:
    """Build the fully connected regression network.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to the width of the notebook-level
        `X_trans` matrix.
    hidden_units : sequence of int, default (512, 256, 128, 64)
        Width of each hidden `Dense` layer, in order; every hidden layer uses
        the GELU activation.

    Returns
    -------
    tf.keras.Model
        Uncompiled model ending in a single linear unit named "price".
    """
    if input_dim is None:
        input_dim = X_trans.shape[1]

    # `inputs` rather than `input`, to avoid shadowing the Python builtin
    inputs = tf.keras.layers.Input((input_dim,))

    x = inputs
    for units in hidden_units:
        x = tf.keras.layers.Dense(units, activation="gelu")(x)

    # single linear output unit for the regression target
    outputs = tf.keras.layers.Dense(1, name="price")(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# 10-fold cross-validation of the dense network on the pre-transformed features.
# The split uses the same seed as the other sections, so folds are comparable.
cv = KFold(n_splits=10, shuffle=True, random_state=21)
for fold_i, (train_idx, test_idx) in enumerate(cv.split(X_trans, y)):
    print(f"Training fold {fold_i + 1}")

    # Start every fold from a clean, seeded state: drop the previous fold's Keras
    # state and fix the seeds so weight initialisation is reproducible.
    # (set_random_seed also seeds Python's `random` and NumPy's global generator.)
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(21 + fold_i)

    # Hold out 10% of the training fold for early stopping. Monitoring the test
    # fold instead would pick the stopping epoch on the data the model is scored
    # on and make the reported metrics optimistic.
    shuffled_train_idx = np.random.default_rng(21 + fold_i).permutation(train_idx)
    n_val = max(1, int(0.1 * len(shuffled_train_idx)))
    val_idx, fit_idx = shuffled_train_idx[:n_val], shuffled_train_idx[n_val:]

    # split data
    X_train, X_val, X_test = X_trans[fit_idx], X_trans[val_idx], X_trans[test_idx]
    y_train, y_val, y_test = y[fit_idx], y[val_idx], y[test_idx]

    # create datasets
    train_ds = construct_tf_dataset(X_train, y_train)
    val_ds = construct_tf_dataset(X_val, y_val)
    test_ds = construct_tf_dataset(X_test, y_test)

    # create model
    model = create_tf_model()

    # compile model
    model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae", "mse"])

    # stop once the validation loss has not improved for 3 epochs and roll back
    # to the best epoch's weights
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True
    )

    # create tensorboard callback (one log directory per fold)
    log_dir = f"logs/fit/fold_{fold_i + 1}"
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    # train model
    fit_time_start = time.time()
    model.fit(train_ds, epochs=200, validation_data=val_ds, callbacks=[early_stopping_callback, tensorboard_callback])
    fit_time_end = time.time()

    # run predictions on the untouched test fold
    score_time_start = time.time()
    y_pred = model.predict(test_ds).reshape(-1)
    score_time_end = time.time()

    # store metrics (same record layout as the scikit-learn results)
    cv_results.append({
        "fit_time": fit_time_end - fit_time_start,
        "score_time": score_time_end - score_time_start,
        "r2": r2_score(y_test, y_pred),
        "mse": mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
        "mape": mean_absolute_percentage_error(y_test, y_pred),
        "category": "TensorFlow",
        "name": "DNNRegressor",
    })

## Summary

In [None]:
# Turn the accumulated per-fold records into a DataFrame (one row per model and fold)
# and preview it.
df_scores = pd.DataFrame(cv_results)
df_scores.head()

In [None]:
# Average every metric per (category, name) - pivot_table aggregates with the mean
# by default - then round to 4 decimals and order from lowest to highest MSE.
rdf = (
    df_scores
    .pivot_table(values=["r2", "mse", "mae", "mape"], index=["category", "name"])
    .round(4)
    .sort_values("mse")
)
rdf.style.background_gradient()

In [None]:
# Persist the aggregated summary and the raw per-fold scores.
# The output folder is created first: `to_csv` does not create missing
# directories, and failing here would discard the results of the whole run.
output_dir = Path("../dataset/train_baseline")
output_dir.mkdir(parents=True, exist_ok=True)

rdf.to_csv(output_dir / "summary_regression_scores.csv", index=True)
df_scores.to_csv(output_dir / "raw_regression_scores.csv", index=False)